Spaces:
Sleeping
Sleeping
File size: 5,872 Bytes
c3c0d39 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
from .utils import str_to_date, str_to_list
import logging
import pandas as pd
import re
# Column labels applied, in order, to the columns read from the CSV file.
COL_NAMES = [
    'database_id',
    'museum_id',
    'art_piece_place',
    'art_piece_type',
    'art_piece_date',
    'related_names',
    'related_dates',
    'related_places',
    'keywords',
    'related_emotions',
    'explanation',
    'question',
    'name_image',  # to be implemented
]
class Database:
    """
    A database, backed by Pandas.

    The CSV content is loaded into a DataFrame whose columns are labelled
    with COL_NAMES; the related_names, related_dates, related_places and
    related_emotions columns are then converted in place from text to lists.
    """

    def __init__(self, csv):
        """
        Creates a new database given a CSV file whose separator is ";".

        Only the first len(COL_NAMES) columns are read and they are labelled
        with COL_NAMES. NA detection is disabled (na_filter=False), so empty
        cells are kept as empty strings instead of being turned into NaN.
        """
        self.dataframe = pd.read_csv(
            csv,
            sep=";",
            names=COL_NAMES,
            usecols=range(len(COL_NAMES)),
            na_filter=False,
        )
        self.normalize_fields()

    def get_dataframe(self):
        """
        Access to the underlying dataframe (the live object, not a copy).
        """
        return self.dataframe

    def n_pieces(self):
        """
        Returns the number of art pieces (rows) in the database.
        """
        return len(self.dataframe)

    def normalize_fields(self):
        """
        Normalize all the list-like fields, i.e. translate their string
        content into lists.
        """
        self.normalize_field_related_names()
        self.normalize_field_related_dates()
        self.normalize_field_related_places()
        self.normalize_field_related_emotions()

    def _apply_row_normalizer(self, column, normalize):
        """
        Replaces `column` with the result of calling `normalize` on each row.

        Nothing is done when the dataframe has no rows: in that case
        DataFrame.apply(axis=1) does not build a per-row Series of lists (it
        may probe the callback with a placeholder row and can hand back a
        DataFrame), which makes the single-column assignment fail.
        """
        if self.dataframe.empty:
            return
        self.dataframe[column] = self.dataframe.apply(normalize, axis=1)

    @staticmethod
    def _split_field(row, column, plural, **split_kwargs):
        """
        Splits the text of `row[column]` with str_to_list.

        Extra keyword arguments are forwarded to str_to_list. When
        str_to_list raises ValueError, a warning mentioning the row's
        database_id is logged and an empty list is returned.
        """
        value = row[column]
        try:
            return str_to_list(value, **split_kwargs)
        except ValueError:
            logging.warning(
                f"ignoring dubious list of related {plural} \"{value}\" for database_id={row['database_id']}")
            return []

    @staticmethod
    def _keep_plausible(items, row, singular, min_len, max_len):
        """
        Keeps the items whose length is strictly between min_len and max_len.

        Every rejected item is reported with a logged warning mentioning the
        row's database_id. The order of the kept items is preserved.
        """
        kept = []
        for item in items:
            if min_len < len(item) < max_len:
                kept.append(item)
            else:
                logging.warning(
                    f"ignoring dubious related {singular} \"{item}\" for database_id={row['database_id']}")
        return kept

    def normalize_field_related_names(self):
        """
        Translates the string content of the related_names column into lists of names.

        The separators passed to str_to_list include the space, so that
        firstname and lastname are split. A name is kept only when its length
        is between 3 and 15 characters; otherwise a warning is logged and the
        value is ignored.
        """
        def normalize(row):
            names = self._split_field(
                row, 'related_names', 'names', separators=',;\n ')
            return self._keep_plausible(names, row, 'name', 2, 16)

        self._apply_row_normalizer('related_names', normalize)

    def normalize_field_related_dates(self):
        """
        Translates the string content of the related_dates column into lists of dates.

        Each entry is converted with str_to_date. When the conversion raises
        ValueError, a warning is logged and the value is ignored; a None
        result is dropped silently.
        """
        def normalize(row):
            dates = []
            for str_date in self._split_field(row, 'related_dates', 'dates'):
                try:
                    date = str_to_date(str_date)
                except ValueError:
                    logging.warning(
                        f"ignoring dubious related date \"{str_date}\" for database_id={row['database_id']}")
                    continue
                if date is not None:
                    dates.append(date)
            return dates

        self._apply_row_normalizer('related_dates', normalize)

    def normalize_field_related_places(self):
        """
        Translates the string content of the related_places column into lists of places.

        A place is kept only when its length is between 3 and 31 characters;
        otherwise a warning is logged and the value is ignored.
        """
        def normalize(row):
            places = self._split_field(row, 'related_places', 'places')
            return self._keep_plausible(places, row, 'place', 2, 32)

        self._apply_row_normalizer('related_places', normalize)

    def normalize_field_related_emotions(self):
        """
        Translates the string content of the related_emotions column into lists of emotions.

        Each emotion is lowercased and stripped of surrounding whitespace,
        then kept only when its length is between 2 and 29 characters (very
        long strings are likely not emotions); otherwise a warning is logged
        and the value is ignored.
        """
        def normalize(row):
            emotions = self._split_field(row, 'related_emotions', 'emotions')
            # Normalize before filtering so the length check and the warning
            # both apply to the cleaned value.
            cleaned = [emotion.lower().strip() for emotion in emotions]
            return self._keep_plausible(cleaned, row, 'emotion', 1, 30)

        self._apply_row_normalizer('related_emotions', normalize)
|