import logging
import re  # NOTE(review): not referenced by the code visible in this module

import pandas as pd

from .utils import str_to_date, str_to_list

# Column names applied, in order, to the columns read from the CSV file.
COL_NAMES = [
    'database_id',
    'museum_id',
    'art_piece_place',
    'art_piece_type',
    'art_piece_date',
    'related_names',
    'related_dates',
    'related_places',
    'keywords',
    'related_emotions',
    'explanation',
    'question',
    'name_image',  # to be implemented
]


def _length_between(lower, upper):
    """
    Builds an item converter that returns the item unchanged when
    lower < len(item) < upper (both bounds exclusive) and raises ValueError
    otherwise.
    """
    def convert(item):
        if not lower < len(item) < upper:
            raise ValueError(item)
        return item

    return convert


class Database:
    """
    A database, backed by Pandas.
    """

    def __init__(self, csv):
        """
        Creates a new database given a CSV file which separator is ";".

        The first len(COL_NAMES) columns are read and named after COL_NAMES.
        Empty cells are kept as empty strings (na_filter is disabled). The
        list-like columns are normalized right after loading.
        """
        self.dataframe = pd.read_csv(
            csv,
            sep=";",
            names=COL_NAMES,
            usecols=range(0, len(COL_NAMES)),
            na_filter=False)
        self.normalize_fields()

    def get_dataframe(self):
        """
        Access to the underlying dataframe.
        """
        return self.dataframe

    def n_pieces(self):
        """
        Returns the number of art pieces in the database, i.e. the number of
        rows of the underlying dataframe.
        """
        return len(self.dataframe)

    def normalize_fields(self):
        """
        Normalize all the fields, e.g. by translating string split by commas
        into lists.
        """
        self.normalize_field_related_names()
        self.normalize_field_related_dates()
        self.normalize_field_related_places()
        self.normalize_field_related_emotions()

    def _normalize_list_column(self, column, label, convert, prepare=None,
                               **split_options):
        """
        Replaces the content of a column by lists of normalized items.

        Each cell is split with str_to_list (split_options are forwarded to
        it). If splitting raises ValueError, a warning is logged and the cell
        becomes an empty list. Otherwise every item goes through prepare (when
        given), then through convert:

        - if convert raises ValueError, a warning is logged and the item is
          dropped;
        - if convert returns None, the item is dropped silently;
        - any other returned value is kept.

        label is the singular noun used in the warnings ("name", "date", ...);
        its plural is built by appending "s".
        """
        def normalize(database_id, value):
            try:
                items = str_to_list(value, **split_options)
            except ValueError:
                logging.warning(
                    f"ignoring dubious list of related {label}s \"{value}\" for database_id={database_id}")
                return []

            kept = []
            for item in items:
                if prepare is not None:
                    item = prepare(item)
                try:
                    converted = convert(item)
                except ValueError:
                    logging.warning(
                        f"ignoring dubious related {label} \"{item}\" for database_id={database_id}")
                    continue
                if converted is not None:
                    kept.append(converted)
            return kept

        frame = self.dataframe
        normalized = [
            normalize(database_id, value)
            for database_id, value in zip(frame['database_id'], frame[column])
        ]
        # The column is built as an explicit object Series instead of using
        # DataFrame.apply(axis=1): on a frame without rows, apply returns a
        # DataFrame rather than a Series, which cannot be reliably assigned to
        # a single column.
        frame[column] = pd.Series(normalized, index=frame.index, dtype=object)

    def normalize_field_related_names(self):
        """
        Translates the string content of the related_names column into lists
        of names.

        Firstname and lastname are split (whitespace is one of the
        separators). In case a name seems too short or too long (it must be
        longer than 2 and shorter than 16 characters), a warning is raised and
        the value is ignored.
        """
        self._normalize_list_column(
            'related_names', 'name', _length_between(2, 16),
            separators=',;\n ')

    def normalize_field_related_dates(self):
        """
        Translates the string content of the related_dates column into lists
        of dates.

        In case a value cannot be translated (str_to_date raises ValueError),
        a warning is raised and the value is ignored. Values for which
        str_to_date returns None are dropped without a warning.
        """
        self._normalize_list_column('related_dates', 'date', str_to_date)

    def normalize_field_related_places(self):
        """
        Translates the string content of the related_places column into lists
        of places.

        In case a name seems too short or too long (it must be longer than 2
        and shorter than 32 characters), a warning is raised and the value is
        ignored.
        """
        self._normalize_list_column(
            'related_places', 'place', _length_between(2, 32))

    def normalize_field_related_emotions(self):
        """
        Translates the string content of the related_emotions column into
        lists of emotions.

        Each emotion is lowercased and stripped of surrounding whitespace
        first. Results that are too short or too long to plausibly be an
        emotion (they must be longer than 1 and shorter than 30 characters)
        are ignored with a warning showing the lowercased, stripped value.
        """
        self._normalize_list_column(
            'related_emotions', 'emotion', _length_between(1, 30),
            prepare=lambda emotion: emotion.lower().strip())