File size: 11,010 Bytes

19b102a

import numpy as np
import pandas as pd

from PIL import Image
from tqdm import tqdm
from scipy.sparse import csr_matrix
from typing import Mapping, List, Tuple, Union
from transformers.pipelines import Pipeline, pipeline

from bertopic.representation._mmr import mmr
from bertopic.representation._base import BaseRepresentation


class VisualRepresentation(BaseRepresentation):
    """ From a collection of representative documents, extract 
    images to represent topics. These topics are represented by a
    collage of images. 
    
    Arguments:
        nr_repr_images: Number of representative images to extract
        nr_samples: The number of candidate documents to extract per cluster.
        image_height: The height of the resulting collage
        image_square: Whether to resize each image in the collage
                      to a square. This can be visually more appealing
                      if all input images are all almost squares.
        image_to_text_model: The model to caption images.
        batch_size: The number of images to pass to the 
                    `image_to_text_model`. 

    Usage:

    ```python
    from bertopic.representation import VisualRepresentation
    from bertopic import BERTopic

    # The visual representation is typically not a core representation
    # and is advised to pass to BERTopic as an additional aspect.
    # Aspects can be labeled with dictionaries as shown below:
    representation_model = {
        "Visual_Aspect": VisualRepresentation()
    }

    # Use the representation model in BERTopic as a separate aspect
    topic_model = BERTopic(representation_model=representation_model)
    ```
    """
    def __init__(self,
                 nr_repr_images: int = 9,
                 nr_samples: int = 500,
                 image_height: Tuple[int, int] = 600,
                 image_squares: bool = False,
                 image_to_text_model: Union[str, Pipeline] = None,
                 batch_size: int = 32):
        self.nr_repr_images = nr_repr_images
        self.nr_samples = nr_samples
        self.image_height = image_height
        self.image_squares = image_squares

        # Text-to-image model
        if isinstance(image_to_text_model, Pipeline):
            self.image_to_text_model = image_to_text_model
        elif isinstance(image_to_text_model, str):
            self.image_to_text_model = pipeline("image-to-text", model=image_to_text_model)
        elif image_to_text_model is None:
            self.image_to_text_model = None
        else:
            raise ValueError("Please select a correct transformers pipeline. For example:"
                             "pipeline('image-to-text', model='nlpconnect/vit-gpt2-image-captioning')")
        self.batch_size = batch_size

    def extract_topics(self,
                       topic_model,
                       documents: pd.DataFrame,
                       c_tf_idf: csr_matrix,
                       topics: Mapping[str, List[Tuple[str, float]]]
                       ) -> Mapping[str, List[Tuple[str, float]]]:
        """ Extract topics

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: The topic c-TF-IDF representation
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            representative_images: Representative images per topic
        """
        # Extract image ids of most representative documents
        images = documents["Image"].values.tolist()
        (_, _, _, 
         repr_docs_ids) = topic_model._extract_representative_docs(c_tf_idf, 
                                                                   documents, 
                                                                   topics,
                                                                   nr_samples=self.nr_samples,
                                                                   nr_repr_docs=self.nr_repr_images)
        unique_topics = sorted(list(topics.keys()))

        # Combine representative images into a single representation
        representative_images = {}
        for topic in tqdm(unique_topics):
            
            # Get and order represetnative images
            sliced_examplars = repr_docs_ids[topic+topic_model._outliers]
            sliced_examplars = [sliced_examplars[i:i + 3] for i in 
                                range(0, len(sliced_examplars), 3)]
            images_to_combine = [
                    [Image.open(images[index]) if isinstance(images[index], str) 
                     else images[index] for index in sub_indices]
                for sub_indices in sliced_examplars
            ]

            # Concatenate representative images
            representative_image = get_concat_tile_resize(images_to_combine, 
                                                          self.image_height,
                                                          self.image_squares)
            representative_images[topic] = representative_image

            # Make sure to properly close images
            if isinstance(images[0], str):
                for image_list in images_to_combine:
                    for image in image_list:
                        image.close()
        
        return representative_images
    
    def _convert_image_to_text(self,
                               images: List[str],
                               verbose: bool = False) -> List[str]:
        """ Convert a list of images to captions. 

        Arguments:
            images: A list of images or words to be converted to text.
            verbose: Controls the verbosity of the process

        Returns:
            List of captions
        """
        # Batch-wise image conversion
        if self.batch_size is not None:
            documents = []
            for batch in tqdm(self._chunks(images), disable=not verbose):
                outputs = self.image_to_text_model(batch)
                captions = [output[0]["generated_text"] for output in outputs]
                documents.extend(captions)

        # Convert images to text
        else:
            outputs = self.image_to_text_model(images)
            documents = [output[0]["generated_text"] for output in outputs]

        return documents
    
    def image_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame:
        """ Convert images to text """
        # Create image topic embeddings
        topics = documents.Topic.values.tolist()
        images = documents.Image.values.tolist()
        df = pd.DataFrame(np.hstack([np.array(topics).reshape(-1, 1), embeddings]))
        image_topic_embeddings = df.groupby(0).mean().values
        
        # Extract image centroids
        image_centroids = {}
        unique_topics = sorted(list(set(topics)))
        for topic, topic_embedding in zip(unique_topics, image_topic_embeddings):
            indices = np.array([index for index, t in enumerate(topics) if t == topic])
            top_n = min([self.nr_repr_images, len(indices)])
            indices = mmr(topic_embedding.reshape(1, -1), embeddings[indices], indices, top_n=top_n, diversity=0.1)
            image_centroids[topic] = indices
            
        # Extract documents
        documents = pd.DataFrame(columns=["Document", "ID", "Topic", "Image"])
        current_id = 0
        for topic, image_ids in tqdm(image_centroids.items()):
            selected_images = [Image.open(images[index]) if isinstance(images[index], str) else images[index] for index in image_ids]
            text = self._convert_image_to_text(selected_images)
            
            for doc, image_id in zip(text, image_ids):
                documents.loc[len(documents), :] = [doc, current_id, topic, images[image_id]]
                current_id += 1
            
            # Properly close images
            if isinstance(images[image_ids[0]], str):
                for image in selected_images:
                    image.close()

        return documents

    def _chunks(self, images):     
        for i in range(0, len(images), self.batch_size):
            yield images[i:i + self.batch_size]


def get_concat_h_multi_resize(im_list):
    """
    Code adapted from: https://note.nkmk.me/en/python-pillow-concat-images/
    """
    min_height = min(im.height for im in im_list)
    min_height = max(im.height for im in im_list)
    im_list_resize = []
    for im in im_list:
        im.resize((int(im.width * min_height / im.height), min_height), resample=0)
        im_list_resize.append(im)

    total_width = sum(im.width for im in im_list_resize)
    dst = Image.new('RGB', (total_width, min_height), (255, 255, 255))
    pos_x = 0
    for im in im_list_resize:
        dst.paste(im, (pos_x, 0))
        pos_x += im.width
    return dst


def get_concat_v_multi_resize(im_list):
    """
    Code adapted from: https://note.nkmk.me/en/python-pillow-concat-images/
    """
    min_width = min(im.width for im in im_list)
    min_width = max(im.width for im in im_list)
    im_list_resize = [im.resize((min_width, int(im.height * min_width / im.width)), resample=0)
                      for im in im_list]
    total_height = sum(im.height for im in im_list_resize)
    dst = Image.new('RGB', (min_width, total_height), (255, 255, 255))
    pos_y = 0
    for im in im_list_resize:
        dst.paste(im, (0, pos_y))
        pos_y += im.height
    return dst


def get_concat_tile_resize(im_list_2d, image_height=600, image_squares=False):
    """
    Code adapted from: https://note.nkmk.me/en/python-pillow-concat-images/
    """
    images = [[image.copy() for image in images] for images in im_list_2d]
    
    # Create 
    if image_squares:
        width =  int(image_height / 3)
        height = int(image_height / 3)
        images = [[image.resize((width, height)) for image in images] for images in im_list_2d]
    
    # Resize images based on minimum size
    else:
        min_width = min([min([img.width for img in imgs]) for imgs in im_list_2d])
        min_height = min([min([img.height for img in imgs]) for imgs in im_list_2d])
        for i, imgs in enumerate(images):
            for j, img in enumerate(imgs):
                if img.height > img.width:
                    images[i][j] = img.resize((int(img.width * min_height / img.height), min_height), resample=0)
                elif img.width > img.height:
                    images[i][j] = img.resize((min_width, int(img.height * min_width / img.width)), resample=0)
                else:
                    images[i][j] = img.resize((min_width, min_width))

    # Resize grid image
    images = [get_concat_h_multi_resize(im_list_h) for im_list_h in images]
    img = get_concat_v_multi_resize(images)
    height_percentage = (image_height/float(img.size[1]))
    adjusted_width = int((float(img.size[0])*float(height_percentage)))
    img = img.resize((adjusted_width, image_height), Image.Resampling.LANCZOS)
    
    return img