File size: 1,301 Bytes
1f282bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import numpy as np
from PIL import Image
import base64
import io
def process_image_with_models(
    image: Image.Image,
    models: dict,
    box_threshold: float = 0.05,
    iou_threshold: float = 0.1
) -> tuple:
    """Detect objects in an image, caption it, and render the detections.

    Args:
        image: Input PIL image (any mode; the detector receives an RGB-derived
            copy, the caption processor receives the image unchanged).
        models: Mapping with the keys ``'yolo_model'`` (callable detector),
            ``'processor'`` (callable with ``batch_decode``) and
            ``'caption_model'`` (object with ``generate``).
        box_threshold: Minimum detection confidence, forwarded to the detector
            as ``conf``.
        iou_threshold: IoU threshold for non-maximum suppression, forwarded to
            the detector as ``iou``.

    Returns:
        A ``(img_str, coordinates, captions)`` tuple where ``img_str`` is the
        base64-encoded PNG of the annotated image as ``bytes`` (call
        ``.decode("ascii")`` for text), ``coordinates`` is a list of
        ``[x1, y1, x2, y2]`` boxes taken from ``boxes.xyxy``, and ``captions``
        is the list of decoded caption strings.

    Raises:
        KeyError: If ``models`` lacks one of the required keys.
    """
    # NOTE(review): the detector is assumed to follow the Ultralytics API
    # (results[0].boxes / results[0].plot(), `conf` / `iou` keyword arguments,
    # numpy inputs interpreted as BGR) -- confirm against the model loader.

    # Force three RGB channels so the channel flip below is well defined for
    # RGBA / greyscale / palette inputs.
    rgb_array = np.array(image.convert("RGB"))
    # Numpy inputs are read as BGR by the detector, so flip the channel axis;
    # the copy keeps the array contiguous for downstream tensor conversion.
    bgr_array = np.ascontiguousarray(rgb_array[:, :, ::-1])

    # Run detection with the caller's thresholds (previously ignored).
    results = models['yolo_model'](
        bgr_array,
        conf=box_threshold,
        iou=iou_threshold,
    )
    detection = results[0]

    # Bounding boxes as plain Python lists.
    coordinates = detection.boxes.xyxy.cpu().numpy().tolist()

    # Caption the whole image.
    processor = models['processor']
    inputs = processor(images=image, return_tensors="pt")
    outputs = models['caption_model'].generate(
        **inputs,
        max_length=50,
        num_beams=5,
        early_stopping=True
    )
    captions = processor.batch_decode(outputs, skip_special_tokens=True)

    # plot() yields a BGR array; flip back to RGB before handing it to PIL so
    # both the picture and the box colours are rendered correctly.
    annotated_bgr = detection.plot()
    labeled_img = Image.fromarray(np.ascontiguousarray(annotated_bgr[..., ::-1]))

    # Serialise the annotated image as base64-encoded PNG bytes.
    buffered = io.BytesIO()
    labeled_img.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue())
    return img_str, coordinates, captions
|