Spaces:

not-lain
/

gpu-utils

Paused

App Files Community

not-lain committed on Apr 4, 2025

Commit

b9bde42

1 Parent(s): cce19ac

rollback to last stable

Browse files

Files changed (1) hide show

app.py +158 -15

app.py CHANGED Viewed

@@ -3,11 +3,28 @@ import spaces
 import torch
 from loadimg import load_img
 from torchvision import transforms
-from transformers import AutoModelForImageSegmentation
 from diffusers import FluxFillPipeline
 from PIL import Image, ImageOps
-torch.set_float32_matmul_precision(["high", "highest"][0])
 birefnet = AutoModelForImageSegmentation.from_pretrained(
     "ZhengPeng7/BiRefNet", trust_remote_code=True
@@ -22,10 +39,6 @@ transform_image = transforms.Compose(
     ]
 )
-pipe = FluxFillPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16
-).to("cuda")
 def prepare_image_and_mask(
     image,
@@ -110,9 +123,10 @@ def rmbg(image=None, url=None):
     image = load_img(image).convert("RGB")
     image_size = image.size
     input_images = transform_image(image).unsqueeze(0).to("cuda")
-    # Prediction
-    with torch.no_grad():
-        preds = birefnet(input_images)[-1].sigmoid().cpu()
     pred = preds[0].squeeze()
     pred_pil = transforms.ToPILImage()(pred)
     mask = pred_pil.resize(image_size)
@@ -120,7 +134,65 @@ def rmbg(image=None, url=None):
     return image
-@spaces.GPU
 def main(*args):
     api_num = args[0]
     args = args[1:]
@@ -130,12 +202,18 @@ def main(*args):
         return outpaint(*args)
     elif api_num == 3:
         return inpaint(*args)
 rmbg_tab = gr.Interface(
     fn=main,
     inputs=[
-        gr.Number(1, visible=False),
         "image",
         gr.Text("", label="url"),
     ],
@@ -149,7 +227,7 @@ rmbg_tab = gr.Interface(
 outpaint_tab = gr.Interface(
     fn=main,
     inputs=[
-        gr.Number(2, visible=False),
         gr.Image(label="image", type="pil"),
         gr.Number(label="padding top"),
         gr.Number(label="padding bottom"),
@@ -169,7 +247,7 @@ outpaint_tab = gr.Interface(
 inpaint_tab = gr.Interface(
     fn=main,
     inputs=[
-        gr.Number(3, visible=False),
         gr.Image(label="image", type="pil"),
         gr.Image(label="mask", type="pil"),
         gr.Text(label="prompt"),
@@ -183,9 +261,74 @@ inpaint_tab = gr.Interface(
     description="it is recommended that you use https://github.com/la-voliere/react-mask-editor when creating an image mask in JS and then inverse it before sending it to this space",
 )
 demo = gr.TabbedInterface(
-    [rmbg_tab, outpaint_tab, inpaint_tab],
-    ["remove background", "outpainting", "inpainting"],
     title="Utilities that require GPU",
 )

 import torch
 from loadimg import load_img
 from torchvision import transforms
+from transformers import AutoModelForImageSegmentation, pipeline
 from diffusers import FluxFillPipeline
 from PIL import Image, ImageOps
+# from sam2.sam2_image_predictor import SAM2ImagePredictor
+import numpy as np
+from simple_lama_inpainting import SimpleLama
+from contextlib import contextmanager
+@contextmanager
+def float32_high_matmul_precision():
+    torch.set_float32_matmul_precision("high")
+    try:
+        yield
+    finally:
+        torch.set_float32_matmul_precision("highest")
+pipe = FluxFillPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16
+).to("cuda")
 birefnet = AutoModelForImageSegmentation.from_pretrained(
     "ZhengPeng7/BiRefNet", trust_remote_code=True
     ]
 )
 def prepare_image_and_mask(
     image,
     image = load_img(image).convert("RGB")
     image_size = image.size
     input_images = transform_image(image).unsqueeze(0).to("cuda")
+    with float32_high_matmul_precision():
+        # Prediction
+        with torch.no_grad():
+            preds = birefnet(input_images)[-1].sigmoid().cpu()
     pred = preds[0].squeeze()
     pred_pil = transforms.ToPILImage()(pred)
     mask = pred_pil.resize(image_size)
     return image
+# def mask_generation(image=None, d=None):
+#     # use bfloat16 for the entire notebook
+#     # torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
+#     # # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
+#     # if torch.cuda.get_device_properties(0).major >= 8:
+#     #     torch.backends.cuda.matmul.allow_tf32 = True
+#     #     torch.backends.cudnn.allow_tf32 = True
+#     d = eval(d)  # convert this to dictionary
+#     with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+#         predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2.1-hiera-large")
+#         predictor.set_image(image)
+#         input_point = np.array(d["input_points"])
+#         input_label = np.array(d["input_labels"])
+#         masks, scores, logits = predictor.predict(
+#             point_coords=input_point,
+#             point_labels=input_label,
+#             multimask_output=True,
+#         )
+#     sorted_ind = np.argsort(scores)[::-1]
+#     masks = masks[sorted_ind]
+#     scores = scores[sorted_ind]
+#     logits = logits[sorted_ind]
+#     out = []
+#     for i in range(len(masks)):
+#         m = Image.fromarray(masks[i] * 255).convert("L")
+#         comp = Image.composite(image, m, m)
+#         out.append((comp, f"image {i}"))
+#     return out
+def erase(image=None, mask=None):
+    simple_lama = SimpleLama()
+    image = load_img(image)
+    mask = load_img(mask).convert("L")
+    return simple_lama(image, mask)
+# Initialize Whisper model
+whisper = pipeline(
+    task="automatic-speech-recognition",
+    model="openai/whisper-large-v3",
+    chunk_length_s=30,
+    device="cuda" if torch.cuda.is_available() else "cpu",
+)
+def transcribe(audio, task="transcribe"):
+    if audio is None:
+        raise gr.Error("No audio file submitted!")
+    text = whisper(
+        audio, batch_size=8, generate_kwargs={"task": task}, return_timestamps=True
+    )["text"]
+    return text
+@spaces.GPU(duration=120)
 def main(*args):
     api_num = args[0]
     args = args[1:]
         return outpaint(*args)
     elif api_num == 3:
         return inpaint(*args)
+    # elif api_num == 4:
+    #     return mask_generation(*args)
+    elif api_num == 5:
+        return erase(*args)
+    elif api_num == 6:
+        return transcribe(*args)
 rmbg_tab = gr.Interface(
     fn=main,
     inputs=[
+        gr.Number(1, interactive=False),
         "image",
         gr.Text("", label="url"),
     ],
 outpaint_tab = gr.Interface(
     fn=main,
     inputs=[
+        gr.Number(2, interactive=False),
         gr.Image(label="image", type="pil"),
         gr.Number(label="padding top"),
         gr.Number(label="padding bottom"),
 inpaint_tab = gr.Interface(
     fn=main,
     inputs=[
+        gr.Number(3, interactive=False),
         gr.Image(label="image", type="pil"),
         gr.Image(label="mask", type="pil"),
         gr.Text(label="prompt"),
     description="it is recommended that you use https://github.com/la-voliere/react-mask-editor when creating an image mask in JS and then inverse it before sending it to this space",
 )
+# sam2_tab = gr.Interface(
+#     main,
+#     inputs=[
+#         gr.Number(4, interactive=False),
+#         gr.Image(type="pil"),
+#         gr.Text(),
+#     ],
+#     outputs=gr.Gallery(),
+#     examples=[
+#         [
+#             4,
+#             "./assets/truck.jpg",
+#             '{"input_points": [[500, 375], [1125, 625]], "input_labels": [1, 0]}',
+#         ]
+#     ],
+#     api_name="sam2",
+#     cache_examples=False,
+# )
+erase_tab = gr.Interface(
+    main,
+    inputs=[
+        gr.Number(5, interactive=False),
+        gr.Image(type="pil"),
+        gr.Image(type="pil"),
+    ],
+    outputs=gr.Image(),
+    examples=[
+        [
+            5,
+            "./assets/rocket.png",
+            "./assets/Inpainting mask.png",
+        ]
+    ],
+    api_name="erase",
+    cache_examples=False,
+)
+transcribe_tab = gr.Interface(
+    fn=main,
+    inputs=[
+        gr.Number(6, interactive=False),
+        gr.Audio(type="filepath"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+    ],
+    outputs="text",
+    api_name="transcribe",
+    description="Upload an audio file to extract text using Whisper Large V3",
+)
 demo = gr.TabbedInterface(
+    [
+        rmbg_tab,
+        outpaint_tab,
+        inpaint_tab,
+        #  sam2_tab,
+        erase_tab,
+        transcribe_tab,
+    ],
+    [
+        "remove background",
+        "outpainting",
+        "inpainting",
+        #  "sam2",
+        "erase",
+        "transcribe",
+    ],
     title="Utilities that require GPU",
 )