Spaces:
Sleeping
Sleeping
fix: prediction coordinates translation
Browse files
app.py
CHANGED
|
@@ -18,6 +18,41 @@ model = VisionEncoderDecoderModel.from_pretrained(pretrained_repo_name, use_auth
|
|
| 18 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 19 |
model.to(device)
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
def process_refexp(image: Image, prompt: str):
|
| 23 |
|
|
@@ -89,7 +124,10 @@ def process_refexp(image: Image, prompt: str):
|
|
| 89 |
print(f"image width, height: {width, height}")
|
| 90 |
print(f"processed prompt: {prompt}")
|
| 91 |
|
| 92 |
-
#
|
|
|
|
|
|
|
|
|
|
| 93 |
x = math.floor(width*center_point["x"])
|
| 94 |
y = math.floor(height*center_point["y"])
|
| 95 |
|
|
|
|
| 18 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 19 |
model.to(device)
|
| 20 |
|
| 21 |
def translate_point_coords_from_out_to_in(point=None, input_image_size=None, output_image_size=None):
    """
    Convert relative prediction coordinates from the resized encoder tensor
    image back to the original input image size.

    The encoder resizes the input to fit inside output_image_size while
    preserving aspect ratio and pads the remainder; relative coordinates
    predicted against the padded output must therefore be rescaled along any
    dimension that carries padding.

    Args:
        point: dict with 'x' and 'y' keys holding coordinates in the [0..1]
            range relative to the padded output image; mutated in place.
        input_image_size: (width, height) tuple of the original input image.
        output_image_size: (width, height) tuple of the encoder tensor image.

    Returns:
        None. The point dict is updated in place.
    """
    assert point is not None
    assert input_image_size is not None
    assert output_image_size is not None
    # print(f"point={point}, input_image_size={input_image_size}, output_image_size={output_image_size}")
    input_width, input_height = input_image_size
    output_width, output_height = output_image_size

    # scale factor of the aspect-ratio-preserving resize
    ratio = min(output_width/input_width, output_height/input_height)

    resized_height = int(input_height*ratio)
    # print(f'>>> resized_height={resized_height}')
    resized_width = int(input_width*ratio)
    # print(f'>>> resized_width={resized_width}')

    # BUG FIX: the original code returned early when the resized size equaled
    # the *input* size (ratio == 1), which wrongly skipped the padding
    # adjustment when the input matched the output along one axis but was
    # padded along the other. The per-dimension checks below are already
    # no-ops when a dimension has no padding, so no early return is needed.

    # translation of the relative positioning is only needed for dimensions that have padding
    if resized_width < output_width:
        # adjust for padding pixels
        point['x'] *= (output_width / resized_width)
    if resized_height < output_height:
        # adjust for padding pixels
        point['y'] *= (output_height / resized_height)
    # print(f"translated point={point}, resized_image_size: {resized_width, resized_height}")
| 56 |
|
| 57 |
def process_refexp(image: Image, prompt: str):
|
| 58 |
|
|
|
|
| 124 |
print(f"image width, height: {width, height}")
|
| 125 |
print(f"processed prompt: {prompt}")
|
| 126 |
|
| 127 |
+
# convert coordinates from tensor image size to input image size
|
| 128 |
+
out_size = (processor.image_processor.size[1], processor.image_processor.size[0])
|
| 129 |
+
translate_point_coords_from_out_to_in(point=center_point, input_image_size=image.size, output_image_size=out_size)
|
| 130 |
+
|
| 131 |
x = math.floor(width*center_point["x"])
|
| 132 |
y = math.floor(height*center_point["y"])
|
| 133 |
|