Spaces:
Sleeping
Sleeping
| from .utils import str_to_date, str_to_list | |
| import logging | |
| import pandas as pd | |
| import re | |
# Names given, in this order, to the columns of the CSV file loaded by
# Database (passed as `names=` to pandas.read_csv).
COL_NAMES = [
    "database_id",
    "museum_id",
    "art_piece_place",
    "art_piece_type",
    "art_piece_date",
    "related_names",
    "related_dates",
    "related_places",
    "keywords",
    "related_emotions",
    "explanation",
    "question",
    "name_image",  # to be implemented
]
class Database:
    """
    A database of art pieces, backed by a pandas DataFrame.

    The related_names, related_dates, related_places and related_emotions
    columns are stored as delimited strings in the CSV file; they are turned
    into Python lists when the database is created.
    """

    def __init__(self, csv):
        """
        Creates a new database given a CSV file which separator is ";".

        The first len(COL_NAMES) columns of the file are read and named after
        COL_NAMES, with pandas' NA detection switched off (na_filter=False).
        The list-valued columns are then normalized in place.

        NOTE(review): explicit `names` are passed without `header`, so pandas
        treats the first line of the file as data -- confirm the CSV has no
        header row.
        """
        self.dataframe = pd.read_csv(
            csv,
            sep=";",
            names=COL_NAMES,
            usecols=range(len(COL_NAMES)),
            na_filter=False,
        )
        self.normalize_fields()

    def get_dataframe(self):
        """
        Access to the underlying dataframe (the very object, not a copy).
        """
        return self.dataframe

    def n_pieces(self):
        """
        Returns the number of art pieces (rows) in the database.
        """
        return len(self.dataframe)

    def normalize_fields(self):
        """
        Normalizes the list-valued columns (related names, dates, places and
        emotions) by translating their delimited strings into lists.
        """
        self.normalize_field_related_names()
        self.normalize_field_related_dates()
        self.normalize_field_related_places()
        self.normalize_field_related_emotions()

    def _rewrite_column(self, column, normalize):
        """
        Replaces `column` with normalize(value, database_id) for every row.

        The values are walked explicitly rather than through
        DataFrame.apply(axis=1): on a frame without rows, apply() probes the
        callback with a dummy row and may hand back a DataFrame instead of a
        Series, which cannot be assigned to a single column.
        """
        frame = self.dataframe
        normalized = [
            normalize(value, database_id)
            for value, database_id in zip(frame[column], frame['database_id'])
        ]
        # An explicit object Series keeps each list as one cell and stays
        # aligned with the frame's own index, even when there are no rows.
        frame[column] = pd.Series(normalized, index=frame.index, dtype=object)

    @staticmethod
    def _split_or_warn(value, database_id, kind, **split_options):
        """
        Splits `value` with str_to_list, forwarding `split_options` to it.

        When str_to_list raises ValueError, a warning naming the list kind and
        the database_id is logged and an empty list is returned.
        """
        try:
            return str_to_list(value, **split_options)
        except ValueError:
            logging.warning(
                f"ignoring dubious list of related {kind}s \"{value}\" for database_id={database_id}")
            return []

    @staticmethod
    def _has_plausible_length(text, database_id, kind, more_than, less_than):
        """
        Tells whether len(text) lies strictly between `more_than` and
        `less_than`; logs a warning when it does not.
        """
        plausible = more_than < len(text) < less_than
        if not plausible:
            logging.warning(
                f"ignoring dubious related {kind} \"{text}\" for database_id={database_id}")
        return plausible

    def normalize_field_related_names(self):
        """
        Translates the string content of the related_names column into lists of names.

        The string is split on commas, semicolons, newlines and spaces; since
        a space is a separator, first names and last names presumably end up
        as separate entries (depends on str_to_list -- TODO confirm).
        Names shorter than 3 or longer than 15 characters are ignored with a
        warning, as is a whole value that str_to_list rejects.
        """
        def normalize(value, database_id):
            names = self._split_or_warn(
                value, database_id, 'name', separators=',;\n ')
            return [
                name for name in names
                if self._has_plausible_length(name, database_id, 'name', 2, 16)
            ]

        self._rewrite_column('related_names', normalize)

    def normalize_field_related_dates(self):
        """
        Translates the string content of the related_dates column into lists of dates.

        Each item is converted with str_to_date. An item for which it raises
        ValueError is ignored with a warning; an item converted to None is
        dropped silently. A whole value that str_to_list rejects yields an
        empty list and a warning.
        """
        def normalize(value, database_id):
            dates = []
            for str_date in self._split_or_warn(value, database_id, 'date'):
                try:
                    date = str_to_date(str_date)
                except ValueError:
                    logging.warning(
                        f"ignoring dubious related date \"{str_date}\" for database_id={database_id}")
                    continue
                if date is not None:
                    dates.append(date)
            return dates

        self._rewrite_column('related_dates', normalize)

    def normalize_field_related_places(self):
        """
        Translates the string content of the related_places column into lists of places.

        Places shorter than 3 or longer than 31 characters are ignored with a
        warning, as is a whole value that str_to_list rejects.
        """
        def normalize(value, database_id):
            places = self._split_or_warn(value, database_id, 'place')
            return [
                place for place in places
                if self._has_plausible_length(place, database_id, 'place', 2, 32)
            ]

        self._rewrite_column('related_places', normalize)

    def normalize_field_related_emotions(self):
        """
        Translates the string content of the related_emotions column into lists of emotions.

        Every emotion is lower-cased and stripped of surrounding whitespace
        first; the result is kept only if it is 2 to 29 characters long,
        otherwise it is ignored with a warning showing the cleaned-up text.
        """
        def normalize(value, database_id):
            emotions = (
                emotion.lower().strip()
                for emotion in self._split_or_warn(value, database_id, 'emotion')
            )
            return [
                emotion for emotion in emotions
                if self._has_plausible_length(emotion, database_id, 'emotion', 1, 30)
            ]

        self._rewrite_column('related_emotions', normalize)