File size: 2,201 Bytes
10f0dc2
 
 
0352ab9
 
10f0dc2
83db32b
 
 
10f0dc2
 
 
 
 
 
 
 
 
 
83db32b
0352ab9
83db32b
0352ab9
10f0dc2
 
b6eb957
0352ab9
10f0dc2
0352ab9
10f0dc2
83db32b
10f0dc2
 
 
 
 
 
 
b6eb957
0352ab9
 
 
b6eb957
83db32b
10f0dc2
b6eb957
 
10f0dc2
83db32b
 
10f0dc2
83db32b
 
10f0dc2
0352ab9
10f0dc2
b6eb957
50ee613
 
83db32b
 
10f0dc2
83db32b
10f0dc2
b6eb957
0352ab9
10f0dc2
b6eb957
0352ab9
83db32b
0352ab9
83db32b
 
 
 
 
10f0dc2
 
 
f94528a
83db32b
0352ab9
83db32b
 
50ee613
b6eb957
10f0dc2
0352ab9
10f0dc2
 
b6eb957
d01d490
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import torch
import gradio as gr
from transformers import Owlv2Processor, Owlv2ForObjectDetection
import cv2
import spaces

# ===============================
# DEVICE, MODEL AND PROCESSOR
# ===============================
# Single source of truth for the checkpoint so the model weights and the
# processor configuration are always loaded from the same place.
_CHECKPOINT = "google/owlv2-base-patch16-ensemble"

# Use CUDA when torch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the detection model and place it on the selected device.
model = Owlv2ForObjectDetection.from_pretrained(_CHECKPOINT)
model = model.to(device)

# The processor prepares text/image inputs and post-processes model outputs.
processor = Owlv2Processor.from_pretrained(_CHECKPOINT)

# ===============================
# MAIN FUNCTION
# ===============================
@spaces.GPU
def query_image(img, text_queries, score_threshold):
    """Detect the queried classes in an image and outline every detection.

    Args:
        img: Image as a NumPy array, as delivered by the
            ``gr.Image(type="numpy")`` input. Only ``img.shape[:2]`` is read
            here, taken as (height, width) -- presumably an H x W x C array;
            confirm against the Gradio documentation.
        text_queries: Comma-separated class names. Whitespace around each
            name is stripped and empty entries are ignored.
        score_threshold: Confidence threshold forwarded to the processor's
            post-processing step; detections scoring at or below it are
            dropped there.

    Returns:
        A tuple ``(image, boxes)``. ``image`` is a copy of ``img`` with one
        green rectangle per kept detection; ``boxes`` is a list of
        ``[x1, y1, x2, y2]`` integer coordinates. When no usable query was
        given, the untouched input image and an empty list are returned.

    Raises:
        gr.Error: If no image was supplied.
    """
    # Without an image there is nothing to process; surface a readable
    # message in the UI instead of an AttributeError on ``img.shape``.
    if img is None:
        raise gr.Error("Please upload an image first.")

    # "cat, dog" must become ["cat", "dog"], not ["cat", " dog"]; blank
    # entries (trailing commas, empty textbox) are discarded.
    queries = [q.strip() for q in (text_queries or "").split(",")]
    queries = [q for q in queries if q]
    if not queries:
        return img, []

    # Prepare inputs
    inputs = processor(
        text=queries,
        images=img,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Move outputs to CPU so post-processing runs next to the CPU-side
    # target sizes created below.
    outputs.logits = outputs.logits.cpu()
    outputs.pred_boxes = outputs.pred_boxes.cpu()

    # (height, width) of the original image, one row per image in the batch.
    target_sizes = torch.tensor([img.shape[:2]])

    # Forward the user's threshold explicitly: if omitted, the processor
    # applies its own default and silently discards low-scoring detections,
    # which makes smaller slider values ineffective.
    results = processor.post_process_grounded_object_detection(
        outputs=outputs,
        threshold=score_threshold,
        target_sizes=target_sizes
    )[0]

    # Draw on a copy so the caller's array is never modified in place.
    annotated = img.copy()
    output_boxes = []

    # Score filtering already happened in post-processing, so every
    # remaining box is kept.
    for box in results["boxes"]:
        x1, y1, x2, y2 = map(int, box.tolist())

        # Save ONLY coordinates
        output_boxes.append([x1, y1, x2, y2])

        # Draw rectangle ONLY (no labels)
        cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)

    return annotated, output_boxes


# ===============================
# GRADIO UI
# ===============================
# Input widgets, listed in the positional order of query_image's parameters:
# image, comma-separated class names, score threshold.
_input_widgets = [
    gr.Image(type="numpy"),
    gr.Textbox(label="Classes (comma separated)"),
    gr.Slider(minimum=0, maximum=1, value=0.1),
]

# Output widgets, matching the (image, boxes) pair returned by query_image.
_output_widgets = [
    gr.Image(label="Bounding Boxes"),
    gr.JSON(label="Coordinates Only"),
]

demo = gr.Interface(
    fn=query_image,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="OWLv2 Bounding Box Coordinates Only",
)

# Start the web app.
demo.launch()