Spaces:

TencentARC
/

Caption-Anything

Runtime error

App Files Files Community

zjr committed on Apr 16, 2023

Commit

2461d7d

1 Parent(s): 388219c

Update UI

Browse files

Files changed (2) hide show

app.py +5 -3
image_editing_utils.py +54 -8

app.py CHANGED Viewed

@@ -59,7 +59,7 @@ ckpt_url_map = {
     'vit_l': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
     'vit_h': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'
 }
-os.makedirs('result', exist_ok=True)
 args = parse_augment()
 checkpoint_url = ckpt_url_map[seg_model_map[args.segmenter]]
@@ -202,6 +202,8 @@ def inference_seg_cap(image_input, point_prompt, click_mode, enable_wiki, langua
     # chat_input = click_coordinate
     prompt = get_prompt(coordinate, click_state, click_mode)
     print('prompt: ', prompt, 'controls: ', controls)
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
     out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)
@@ -218,7 +220,7 @@ def inference_seg_cap(image_input, point_prompt, click_mode, enable_wiki, langua
     input_mask = np.array(out['mask'].convert('P'))
     image_input = mask_painter(np.array(image_input), input_mask)
     origin_image_input = image_input
-    image_input = create_bubble_frame(image_input, text, (evt.index[0], evt.index[1]))
     yield state, state, click_state, chat_input, image_input, wiki
     if not args.disable_gpt and model.text_refiner:
@@ -227,7 +229,7 @@ def inference_seg_cap(image_input, point_prompt, click_mode, enable_wiki, langua
         new_cap = refined_caption['caption']
         wiki = refined_caption['wiki']
         state = state + [(None, f"caption: {new_cap}")]
-        refined_image_input = create_bubble_frame(origin_image_input, new_cap, (evt.index[0], evt.index[1]))
         yield state, state, click_state, chat_input, refined_image_input, wiki

     'vit_l': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
     'vit_h': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'
 }
 args = parse_augment()
 checkpoint_url = ckpt_url_map[seg_model_map[args.segmenter]]
     # chat_input = click_coordinate
     prompt = get_prompt(coordinate, click_state, click_mode)
     print('prompt: ', prompt, 'controls: ', controls)
+    input_points = prompt['input_point']
+    input_labels = prompt['input_label']
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
     out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)
     input_mask = np.array(out['mask'].convert('P'))
     image_input = mask_painter(np.array(image_input), input_mask)
     origin_image_input = image_input
+    image_input = create_bubble_frame(image_input, text, (evt.index[0], evt.index[1]), input_mask, input_points=input_points, input_labels=input_labels)
     yield state, state, click_state, chat_input, image_input, wiki
     if not args.disable_gpt and model.text_refiner:
         new_cap = refined_caption['caption']
         wiki = refined_caption['wiki']
         state = state + [(None, f"caption: {new_cap}")]
+        refined_image_input = create_bubble_frame(origin_image_input, new_cap, (evt.index[0], evt.index[1]), input_mask, input_points=input_points, input_labels=input_labels)
         yield state, state, click_state, chat_input, refined_image_input, wiki

image_editing_utils.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from PIL import Image, ImageDraw, ImageFont
 import copy
-import numpy as np
 def wrap_text(text, font, max_width):
     lines = []
@@ -17,7 +18,7 @@ def wrap_text(text, font, max_width):
     lines.append(current_line)
     return lines
-def create_bubble_frame(image, text, point, font_path='DejaVuSansCondensed-Bold.ttf', font_size_ratio=0.025):
     # Load the image
     if type(image) == np.ndarray:
         image = Image.fromarray(image)
@@ -29,6 +30,7 @@ def create_bubble_frame(image, text, point, font_path='DejaVuSansCondensed-Bold.
     total_chars = len(text)
     max_text_width = int(0.4 * width)
     font_size = int(height * font_size_ratio)
     # Load the font
     font = ImageFont.truetype(font_path, font_size)
@@ -45,20 +47,33 @@ def create_bubble_frame(image, text, point, font_path='DejaVuSansCondensed-Bold.
     bubble_height = text_height + 2 * padding
     # Create a new image for the bubble frame
-    bubble = Image.new('RGBA', (bubble_width, bubble_height), (255, 255, 255, 0))
     # Draw the bubble frame on the new image
     draw = ImageDraw.Draw(bubble)
     # draw.rectangle([(0, 0), (bubble_width - 1, bubble_height - 1)], fill=(255, 255, 255, 0), outline=(255, 255, 255, 0), width=2)
     # Draw the wrapped text line by line
     y_text = padding
     for line in lines:
-        draw.text((padding, y_text), line, font=font, fill=(255, 255, 255, 255))
         y_text += font.getsize(line)[1]
     # Calculate the bubble frame position
-    x, y = point
     if x + bubble_width > width:
         x = width - bubble_width
     if y + bubble_height > height:
@@ -66,4 +81,35 @@ def create_bubble_frame(image, text, point, font_path='DejaVuSansCondensed-Bold.
     # Paste the bubble frame onto the image
     image.paste(bubble, (x, y), bubble)
-    return image

 from PIL import Image, ImageDraw, ImageFont
 import copy
+import numpy as np
+import cv2
 def wrap_text(text, font, max_width):
     lines = []
     lines.append(current_line)
     return lines
+def create_bubble_frame(image, text, point, segmask, input_points, input_labels, font_path='times_with_simsun.ttf', font_size_ratio=0.033, point_size_ratio=0.01):
     # Load the image
     if type(image) == np.ndarray:
         image = Image.fromarray(image)
     total_chars = len(text)
     max_text_width = int(0.4 * width)
     font_size = int(height * font_size_ratio)
+    point_size = max(int(height * point_size_ratio), 1)
     # Load the font
     font = ImageFont.truetype(font_path, font_size)
     bubble_height = text_height + 2 * padding
     # Create a new image for the bubble frame
+    bubble = Image.new('RGBA', (bubble_width, bubble_height), (255,248, 220, 0))
     # Draw the bubble frame on the new image
     draw = ImageDraw.Draw(bubble)
     # draw.rectangle([(0, 0), (bubble_width - 1, bubble_height - 1)], fill=(255, 255, 255, 0), outline=(255, 255, 255, 0), width=2)
+    draw_rounded_rectangle(draw, (0, 0, bubble_width - 1, bubble_height - 1), point_size * 2,
+                        fill=(255,248, 220, 120), outline=None, width=2)
     # Draw the wrapped text line by line
     y_text = padding
     for line in lines:
+        draw.text((padding, y_text), line, font=font, fill=(0, 0, 0, 255))
         y_text += font.getsize(line)[1]
+    # Determine the point by the min area rect of mask
+    try:
+        ret, thresh = cv2.threshold(segmask, 127, 255, 0)
+        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+        largest_contour = max(contours, key=cv2.contourArea)
+        min_area_rect = cv2.minAreaRect(largest_contour)
+        box = cv2.boxPoints(min_area_rect)
+        sorted_points = box[np.argsort(box[:, 0])]
+        right_most_points = sorted_points[-2:]
+        right_down_most_point = right_most_points[np.argsort(right_most_points[:, 1])][-1]
+        x, y = int(right_down_most_point[0]), int(right_down_most_point[1])
+    except:
+        x, y = point
     # Calculate the bubble frame position
     if x + bubble_width > width:
         x = width - bubble_width
     if y + bubble_height > height:
     # Paste the bubble frame onto the image
     image.paste(bubble, (x, y), bubble)
+    draw = ImageDraw.Draw(image)
+    colors = [(0, 191, 255, 255), (255, 106, 106, 255)]
+    for p, label in zip(input_points, input_labels):
+        point_x, point_y = p[0], p[1]
+        left = point_x - point_size
+        top = point_y - point_size
+        right = point_x + point_size
+        bottom = point_y + point_size
+        draw.ellipse((left, top, right, bottom), fill=colors[label])
+    return image
+def draw_rounded_rectangle(draw, xy, corner_radius, fill=None, outline=None, width=1):
+    x1, y1, x2, y2 = xy
+    draw.rectangle(
+        (x1, y1 + corner_radius, x2, y2 - corner_radius),
+        fill=fill,
+        outline=outline,
+        width=width
+    )
+    draw.rectangle(
+        (x1 + corner_radius, y1, x2 - corner_radius, y2),
+        fill=fill,
+        outline=outline,
+        width=width
+    )
+    draw.pieslice((x1, y1, x1 + corner_radius * 2, y1 + corner_radius * 2), 180, 270, fill=fill, outline=outline, width=width)
+    draw.pieslice((x2 - corner_radius * 2, y1, x2, y1 + corner_radius * 2), 270, 360, fill=fill, outline=outline, width=width)
+    draw.pieslice((x2 - corner_radius * 2, y2 - corner_radius * 2, x2, y2), 0, 90, fill=fill, outline=outline, width=width)
+    draw.pieslice((x1, y2 - corner_radius * 2, x1 + corner_radius * 2, y2), 90, 180, fill=fill, outline=outline, width=width)