Spaces:

ClickMons
/

CreativUp_Demo

Sleeping

File size: 5,872 Bytes

c3c0d39

from .utils import str_to_date, str_to_list
import logging
import pandas as pd
import re

# Names given, in file order, to the CSV columns read by Database.
COL_NAMES = [
    'database_id', 'museum_id',
    'art_piece_place', 'art_piece_type', 'art_piece_date',
    'related_names', 'related_dates', 'related_places',
    'keywords', 'related_emotions',
    'explanation', 'question',
    'name_image',  # to be implemented
]


class Database:
    """
    A database of art pieces, backed by a Pandas dataframe.

    The dataframe is loaded from a CSV file; its related_names, related_dates,
    related_places and related_emotions columns are converted from strings into
    Python lists as soon as the file is loaded.
    """

    def __init__(self, csv):
        """
        Creates a new database given a CSV file whose separator is ";".

        Column names come from COL_NAMES and only the first len(COL_NAMES) columns are
        read. Because the names are passed explicitly, pandas treats the first line of
        the file as data, not as a header. na_filter=False keeps empty cells as empty
        strings instead of turning them into NaN.
        """
        self.dataframe = pd.read_csv(
            csv, sep=";", names=COL_NAMES, usecols=range(len(COL_NAMES)), na_filter=False)
        self.normalize_fields()

    def get_dataframe(self):
        """
        Access to the underlying dataframe.
        """
        return self.dataframe

    def n_pieces(self):
        """
        Returns the number of art pieces (rows) in the database.
        """
        return len(self.dataframe)

    def normalize_fields(self):
        """
        Normalize all the fields, e.g. by translating string split by commas into lists.
        """
        self.normalize_field_related_names()
        self.normalize_field_related_dates()
        self.normalize_field_related_places()
        self.normalize_field_related_emotions()

    def _normalize_column(self, column, normalize):
        """
        Replaces every cell of `column` by normalize(database_id, cell).

        The new column is built explicitly rather than with DataFrame.apply(axis=1):
        on a dataframe without rows apply() may hand back a DataFrame instead of a
        Series, and assigning that to a single column raises a ValueError.
        """
        frame = self.dataframe
        cells = [
            normalize(database_id, value)
            for database_id, value in zip(frame['database_id'], frame[column])
        ]
        # dtype=object keeps each list as a single cell, whatever the lists' lengths.
        frame[column] = pd.Series(cells, index=frame.index, dtype=object)

    @staticmethod
    def _split_cell(value, kind, database_id, **options):
        """
        Splits a raw cell with str_to_list, forwarding `options` to it.
        If str_to_list raises a ValueError, a warning is logged and [] is returned.
        """
        try:
            return str_to_list(value, **options)
        except ValueError:
            logging.warning(
                f"ignoring dubious list of related {kind}s \"{value}\" for database_id={database_id}")
            return []

    @staticmethod
    def _has_plausible_length(item, kind, database_id, shortest, longest):
        """
        Tells whether shortest < len(item) < longest (both bounds excluded).
        A warning is logged when the item is rejected.
        """
        plausible = shortest < len(item) < longest
        if not plausible:
            logging.warning(
                f"ignoring dubious related {kind} \"{item}\" for database_id={database_id}")
        return plausible

    def normalize_field_related_names(self):
        """
        Translates the string content of the related_names column into lists of names.
        The cell is split on commas, semicolons, newlines and spaces (the separators
        passed to str_to_list), so firstname and lastname presumably end up as separate
        entries. Only names of 3 to 15 characters are kept; for any other name a warning
        is logged and the value is ignored.
        """
        def normalize(database_id, value):
            names = self._split_cell(value, 'name', database_id, separators=',;\n ')
            return [
                name for name in names
                if self._has_plausible_length(name, 'name', database_id, 2, 16)
            ]
        self._normalize_column('related_names', normalize)

    def normalize_field_related_dates(self):
        """
        Translates the string content of the related_dates column into lists of dates.
        In case a value cannot be translated (str_to_date raises a ValueError), a warning
        is logged and the value is ignored.
        """
        def normalize(database_id, value):
            dates = []
            for str_date in self._split_cell(value, 'date', database_id):
                try:
                    date = str_to_date(str_date)
                except ValueError:
                    logging.warning(
                        f"ignoring dubious related date \"{str_date}\" for database_id={database_id}")
                    continue
                # A None result is dropped silently, without a warning.
                if date is not None:
                    dates.append(date)
            return dates
        self._normalize_column('related_dates', normalize)

    def normalize_field_related_places(self):
        """
        Translates the string content of the related_places column into lists of places.
        Only places of 3 to 31 characters are kept; for any other place a warning is
        logged and the value is ignored.
        """
        def normalize(database_id, value):
            places = self._split_cell(value, 'place', database_id)
            return [
                place for place in places
                if self._has_plausible_length(place, 'place', database_id, 2, 32)
            ]
        self._normalize_column('related_places', normalize)

    def normalize_field_related_emotions(self):
        """
        Translates the string content of the related_emotions column into lists of emotions.
        Each emotion is lowercased and stripped of surrounding whitespace. Only emotions
        of 2 to 29 characters (after cleaning) are kept; for any other emotion a warning
        is logged and the value is ignored.
        """
        def normalize(database_id, value):
            emotions = self._split_cell(value, 'emotion', database_id)
            # Clean first: both the length check and the warning use the cleaned text.
            cleaned = (emotion.lower().strip() for emotion in emotions)
            return [
                emotion for emotion in cleaned
                if self._has_plausible_length(emotion, 'emotion', database_id, 1, 30)
            ]
        self._normalize_column('related_emotions', normalize)