# Spaces: HarshitX / Multi_LLM_Image_Captioning (Sleeping)
# File size: 6,025 Bytes
# 8a8f3ed

import os
import cv2
import numpy as np

from PIL import Image, ImageDraw, ImageFont

class ImageCaptionOverlay:
    """Render text captions onto OpenCV-style ``numpy`` images.

    Two static entry points are provided:

    * ``add_caption_overlay`` draws the caption directly on top of the
      picture with OpenCV's Hershey font, on black backing boxes.
    * ``add_caption_background`` enlarges the canvas with a solid band
      above the picture and renders the caption there with Pillow.
    """

    @staticmethod
    def _wrap_text(caption, max_width, measure_width):
        """Greedily wrap *caption* so each line measures at most *max_width*.

        Args:
            caption: Text to wrap.
            max_width: Maximum allowed line width, in the units returned by
                ``measure_width``.
            measure_width: Callable taking a string and returning its
                rendered width.

        Returns:
            A list of lines. When the whole caption already fits it is
            returned untouched as a single line; otherwise it is split on
            whitespace and re-joined with single spaces. A single word wider
            than ``max_width`` is kept on a line of its own (never split).
        """
        if measure_width(caption) <= max_width:
            return [caption]

        lines = []
        current_line = ""
        for word in caption.split():
            candidate = f"{current_line} {word}" if current_line else word
            if measure_width(candidate) <= max_width:
                current_line = candidate
                continue
            # The word does not fit: flush what we have and start a new line.
            if current_line:
                lines.append(current_line)
            current_line = word

        if current_line:
            lines.append(current_line)
        return lines

    @staticmethod
    def add_caption_overlay(image: np.ndarray, caption: str, position: str = "bottom",
                            font_size: int = 1, thickness: int = 2) -> np.ndarray:
        """Draw *caption* on top of a copy of *image*.

        Args:
            image: Source image; it is copied, never modified in place.
            caption: Text to draw. Wrapped on word boundaries when wider
                than the image width minus 40 pixels.
            position: ``"bottom"`` or ``"top"``; any other value centres the
                caption vertically.
            font_size: OpenCV font scale passed to ``cv2.putText``.
            thickness: Stroke thickness passed to ``cv2.putText``.

        Returns:
            The annotated copy. For an empty or whitespace-only caption the
            copy is returned without any drawing.
        """
        img_copy = image.copy()

        # Nothing to render: avoid drawing an empty black backing box.
        if not caption or not caption.strip():
            return img_copy

        height, width = img_copy.shape[:2]
        font = cv2.FONT_HERSHEY_SIMPLEX

        def measure_width(text):
            return cv2.getTextSize(text, font, font_size, thickness)[0][0]

        # Leave a 20px gutter on each side of the text.
        max_width = width - 40
        lines = ImageCaptionOverlay._wrap_text(caption, max_width, measure_width)

        # Height of a capital letter plus 10px of spacing between lines.
        line_height = cv2.getTextSize("A", font, font_size, thickness)[0][1] + 10
        total_height = len(lines) * line_height

        if position == "bottom":
            start_y = height - total_height - 20
        elif position == "top":
            start_y = 30
        else:  # center
            start_y = (height - total_height) // 2

        # A caption taller than the image would otherwise start above the
        # top edge and lose its first lines; pin it to the top instead.
        start_y = max(start_y, 0)

        for i, line in enumerate(lines):
            text_width, text_height = cv2.getTextSize(line, font, font_size, thickness)[0]
            text_x = (width - text_width) // 2
            # putText expects the baseline-left corner of the text.
            text_y = start_y + (i * line_height) + text_height

            # Filled black box behind the line for readability.
            cv2.rectangle(img_copy,
                          (text_x - 10, text_y - text_height - 5),
                          (text_x + text_width + 10, text_y + 5),
                          (0, 0, 0), -1)

            cv2.putText(img_copy, line, (text_x, text_y), font, font_size,
                        (255, 255, 255), thickness)

        return img_copy

    @staticmethod
    def add_caption_background(image: np.ndarray, caption: str,
                               font_path: str = None,
                               background_color: tuple = (0, 0, 0),
                               text_color: tuple = (255, 255, 255),
                               margin: int = 50,
                               font_size: int = 24) -> np.ndarray:
        """Place *caption* in a solid band added above *image*.

        The output canvas is taller than the input: a band of
        ``background_color`` holding the centred caption sits on top, and the
        original picture is pasted directly below it.

        Args:
            image: Source image, BGR (3-D) or grayscale (2-D).
            caption: Text to render. Wrapped on word boundaries when wider
                than the image width minus ``2 * margin``.
            font_path: Optional TrueType font file. When missing or
                unloadable, ``fonts/Poppins-Regular.ttf`` is tried, then
                Pillow's built-in default font.
            background_color: Band colour, handed to Pillow as an RGB tuple.
            text_color: Caption colour, handed to Pillow as an RGB tuple.
            margin: Padding in pixels above and below the text block; also
                the horizontal gutter used when wrapping.
            font_size: Size used when loading a TrueType font. It has no
                effect on Pillow's built-in default font.

        Returns:
            A new 3-channel BGR array containing the band and the image.
        """
        height, width = image.shape[:2]
        line_gap = 10

        # Pillow gives nicer text rendering than OpenCV; it works in RGB.
        if image.ndim == 2:
            rgb = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        else:
            rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(rgb)

        # Font priority: caller-supplied path, bundled Poppins, Pillow default.
        font = None
        for candidate in (font_path, "fonts/Poppins-Regular.ttf"):
            if not candidate or not os.path.exists(candidate):
                continue
            try:
                font = ImageFont.truetype(candidate, font_size)
                break
            except Exception:
                # Deliberately best-effort: an unreadable font must never
                # prevent the caption from being rendered.
                continue
        if font is None:
            font = ImageFont.load_default()

        draw = ImageDraw.Draw(pil_image)

        def measure_width(text):
            left, _, right, _ = draw.textbbox((0, 0), text, font=font)
            return right - left

        # The full caption's bounding box fixes a common line height and the
        # vertical offset of the ink relative to the drawing origin.
        caption_box = draw.textbbox((0, 0), caption, font=font)
        text_top = caption_box[1]
        text_height = caption_box[3] - caption_box[1]

        max_width = width - (2 * margin)
        lines = ImageCaptionOverlay._wrap_text(caption, max_width, measure_width)

        # Gaps only exist between lines; never let the count go negative
        # when wrapping produced no lines at all.
        gap_count = max(len(lines) - 1, 0)
        total_text_height = len(lines) * text_height + gap_count * line_gap
        band_height = total_text_height + (2 * margin)

        new_image = Image.new('RGB', (width, height + band_height), background_color)
        new_image.paste(pil_image, (0, band_height))

        draw = ImageDraw.Draw(new_image)
        y_offset = margin
        for line in lines:
            left, _, right, _ = draw.textbbox((0, 0), line, font=font)
            # Subtract the bbox offsets so the ink itself, not the drawing
            # origin, is centred and starts exactly at y_offset.
            x_position = (width - (right - left)) // 2 - left
            draw.text((x_position, y_offset - text_top), line,
                      fill=text_color, font=font)
            y_offset += text_height + line_gap

        # Convert back to OpenCV channel order.
        return cv2.cvtColor(np.array(new_image), cv2.COLOR_RGB2BGR)