Aurel-test's picture
Upload folder using huggingface_hub
c3c0d39 verified
from .utils import str_to_date, str_to_list
import logging
import pandas as pd
import re
# Names given, in file order, to the columns of the ";"-separated CSV file
# loaded by Database (passed as `names=` to pandas.read_csv).
COL_NAMES = [
    "database_id",
    "museum_id",
    "art_piece_place",
    "art_piece_type",
    "art_piece_date",
    "related_names",
    "related_dates",
    "related_places",
    "keywords",
    "related_emotions",
    "explanation",
    "question",
    "name_image",  # TODO: to be implemented
]
class Database:
    """
    A database of art pieces, backed by a Pandas dataframe.

    The dataframe is loaded from a ";"-separated CSV file. After loading, the
    related_names, related_dates, related_places and related_emotions columns
    are converted from raw strings into Python lists.
    """

    def __init__(self, csv):
        """
        Creates a new database given a CSV file which separator is ";".

        :param csv: the CSV source, handed as-is to pandas.read_csv.
        """
        # Column names are supplied explicitly and only the first
        # len(COL_NAMES) columns are read. na_filter=False keeps empty cells
        # as empty strings instead of turning them into NaN.
        self.dataframe = pd.read_csv(
            csv,
            sep=";",
            names=COL_NAMES,
            usecols=range(len(COL_NAMES)),
            na_filter=False,
        )
        self.normalize_fields()

    def get_dataframe(self):
        """
        Access to the underlying dataframe.
        """
        return self.dataframe

    def n_pieces(self):
        """
        Returns the number of art pieces (rows) in the database.
        """
        return len(self.dataframe)

    def normalize_fields(self):
        """
        Normalize the list-like fields (related names, dates, places and
        emotions) by translating their string content into lists.
        """
        self.normalize_field_related_names()
        self.normalize_field_related_dates()
        self.normalize_field_related_places()
        self.normalize_field_related_emotions()

    def _normalize_list_column(self, column, label, split, convert):
        """
        Replaces, in place, the content of a column with one list per row.

        :param column: name of the dataframe column to rewrite.
        :param label: plural noun used in the warning logged when a whole
            cell cannot be split (e.g. "names").
        :param split: callable turning a raw cell value into an iterable of
            items; a ValueError means the cell is dubious, in which case a
            warning is logged and the row gets an empty list.
        :param convert: callable (item, database_id) returning the value to
            keep, or None to drop the item; it is expected to log its own
            warning when it drops something.
        """
        frame = self.dataframe
        normalized = []
        # Only the processed column and the id (for log messages) are needed,
        # so iterate these two columns instead of building one Series per row.
        for value, row_id in zip(frame[column], frame['database_id']):
            try:
                items = split(value)
            except ValueError:
                logging.warning(
                    'ignoring dubious list of related %s "%s" for database_id=%s',
                    label, value, row_id)
                normalized.append([])
                continue
            converted = (convert(item, row_id) for item in items)
            normalized.append(
                [item for item in converted if item is not None])
        # An explicit object Series keeps every list as a single cell and is
        # also valid when the dataframe has no rows.
        frame[column] = pd.Series(normalized, index=frame.index, dtype=object)

    @staticmethod
    def _keep_if_length_between(item, row_id, label, min_len, max_len):
        """
        Returns item when min_len < len(item) < max_len (both bounds
        exclusive); otherwise logs a warning and returns None.

        :param label: singular noun used in the warning (e.g. "name").
        """
        if min_len < len(item) < max_len:
            return item
        logging.warning(
            'ignoring dubious related %s "%s" for database_id=%s',
            label, item, row_id)
        return None

    def normalize_field_related_names(self):
        """
        Translates the string content of the related_names column into lists of names. Firstname and lastname are split.
        In case a name seems too short or too long (it must have more than 2 and fewer than 16 characters),
        a warning is raised and the value is ignored.
        """
        # Space is one of the separators, which is what splits first names
        # from last names.
        self._normalize_list_column(
            'related_names',
            'names',
            lambda value: str_to_list(value, separators=',;\n '),
            lambda name, row_id: self._keep_if_length_between(
                name, row_id, 'name', 2, 16),
        )

    def normalize_field_related_dates(self):
        """
        Translates the string content of the related_dates column into lists of dates.
        In case a value cannot be translated, a warning is raised and the value is ignored.
        """
        def to_date(str_date, row_id):
            # A ValueError from str_to_date marks the date as unparsable.
            try:
                return str_to_date(str_date)
            except ValueError:
                logging.warning(
                    'ignoring dubious related date "%s" for database_id=%s',
                    str_date, row_id)
                return None

        self._normalize_list_column(
            'related_dates', 'dates', str_to_list, to_date)

    def normalize_field_related_places(self):
        """
        Translates the string content of the related_places column into lists of places.
        In case a name seems too short or too long (it must have more than 2 and fewer than 32 characters),
        a warning is raised and the value is ignored.
        """
        self._normalize_list_column(
            'related_places',
            'places',
            str_to_list,
            lambda place, row_id: self._keep_if_length_between(
                place, row_id, 'place', 2, 32),
        )

    def normalize_field_related_emotions(self):
        """
        Translates the string content of the related_emotions column into lists of emotions.
        Each emotion is lowercased and stripped of surrounding whitespace; it is then ignored, with a warning,
        unless it has more than 1 and fewer than 30 characters.
        """
        # The length check (and any warning) applies to the normalized text;
        # very long strings are likely not emotions.
        self._normalize_list_column(
            'related_emotions',
            'emotions',
            str_to_list,
            lambda emotion, row_id: self._keep_if_length_between(
                emotion.lower().strip(), row_id, 'emotion', 1, 30),
        )