File size: 5,872 Bytes
c3c0d39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from .utils import str_to_date, str_to_list
import logging
import pandas as pd
import re

# Names given, in order, to the leading columns of the CSV file loaded by Database.
COL_NAMES = [
    "database_id",
    "museum_id",
    "art_piece_place",
    "art_piece_type",
    "art_piece_date",
    "related_names",
    "related_dates",
    "related_places",
    "keywords",
    "related_emotions",
    "explanation",
    "question",
    "name_image",  # to be implemented
]


class Database:
    """
    A database of art pieces, backed by a Pandas dataframe.

    The dataframe is loaded from a CSV file; the related_names, related_dates, related_places and
    related_emotions columns are then converted from text to Python lists.
    """

    def __init__(self, csv):
        """
        Creates a new database given a CSV file which separator is ";".

        `csv` is handed to pandas.read_csv as is. Only the first len(COL_NAMES) columns are read and
        they are named after COL_NAMES. Because `names` is given without `header`, pandas treats the
        first line of the file as data. With na_filter=False, empty cells stay empty strings instead
        of becoming NaN.
        """
        self.dataframe = pd.read_csv(
            csv, sep=";", names=COL_NAMES, usecols=range(0, len(COL_NAMES)), na_filter=False)
        self.normalize_fields()

    def get_dataframe(self):
        """
        Access to the underlying dataframe (the object itself, not a copy).
        """
        return self.dataframe

    def n_pieces(self):
        """
        Returns the number of art pieces in the database, i.e. the number of rows of the dataframe.
        """
        return len(self.dataframe)

    def normalize_fields(self):
        """
        Normalize all the fields, e.g. by translating string split by commas into lists.
        """
        self.normalize_field_related_names()
        self.normalize_field_related_dates()
        self.normalize_field_related_places()
        self.normalize_field_related_emotions()

    def _normalize_list_column(self, column, clean_items, **split_kwargs):
        """
        Replaces, row by row, the text stored in `column` by a list.

        Each cell is split with str_to_list (called with `split_kwargs`, if any) and the pieces are
        handed to `clean_items(pieces, row)`, whose return value becomes the new cell.
        If str_to_list raises ValueError, a warning is logged and the cell becomes an empty list.
        """
        # 'related_names' -> 'related names', as used in the warning message.
        label = column.replace('_', ' ')

        def normalize(row):
            value = row[column]
            try:
                pieces = str_to_list(value, **split_kwargs)
            except ValueError:
                logging.warning(
                    f"ignoring dubious list of {label} \"{value}\" for database_id={row['database_id']}")
                return []
            return clean_items(pieces, row)

        # Rows are walked explicitly rather than with DataFrame.apply(axis=1): on a dataframe without
        # rows, apply may hand back a DataFrame instead of a Series, which cannot be assigned to a
        # single column. An object Series keeps each list as one cell.
        cells = [normalize(row) for _, row in self.dataframe.iterrows()]
        self.dataframe[column] = pd.Series(
            cells, index=self.dataframe.index, dtype=object)

    @staticmethod
    def _has_plausible_length(item, kind, row, min_length, max_length):
        """
        Tells whether min_length < len(item) < max_length; logs a warning when it is not the case.
        `kind` ("name", "place", ...) and the database_id of `row` only appear in the warning.
        """
        plausible = min_length < len(item) < max_length
        if not plausible:
            logging.warning(
                f"ignoring dubious related {kind} \"{item}\" for database_id={row['database_id']}")
        return plausible

    def normalize_field_related_names(self):
        """
        Translates the string content of the related_names column into lists of names.
        Whitespace is one of the separators, so firstname and lastname end up as separate entries.
        A name must be 3 to 15 characters long; otherwise a warning is logged and the name is ignored.
        """
        def keep_names(names, row):
            return [name for name in names
                    if self._has_plausible_length(name, 'name', row, 2, 16)]
        self._normalize_list_column(
            'related_names', keep_names, separators=',;\n ')

    def normalize_field_related_dates(self):
        """
        Translates the string content of the related_dates column into lists of dates.
        In case str_to_date raises ValueError for a value, a warning is logged and the value is ignored.
        """
        def to_dates(str_dates, row):
            dates = []
            for str_date in str_dates:
                try:
                    date = str_to_date(str_date)
                except ValueError:
                    logging.warning(
                        f"ignoring dubious related date \"{str_date}\" for database_id={row['database_id']}")
                    continue
                # A None result is dropped silently, without a warning.
                if date is not None:
                    dates.append(date)
            return dates
        self._normalize_list_column('related_dates', to_dates)

    def normalize_field_related_places(self):
        """
        Translates the string content of the related_places column into lists of places.
        A place must be 3 to 31 characters long; otherwise a warning is logged and the place is ignored.
        """
        def keep_places(places, row):
            return [place for place in places
                    if self._has_plausible_length(place, 'place', row, 2, 32)]
        self._normalize_list_column('related_places', keep_places)

    def normalize_field_related_emotions(self):
        """
        Translates the string content of the related_emotions column into lists of emotions.
        Each emotion is lowercased and stripped of surrounding whitespace first. The result must be
        2 to 29 characters long; otherwise a warning is logged and the emotion is ignored.
        """
        def keep_emotions(emotions, row):
            # Lazy on purpose: each emotion is normalized right before it is checked.
            normalized = (emotion.lower().strip() for emotion in emotions)
            return [emotion for emotion in normalized
                    if self._has_plausible_length(emotion, 'emotion', row, 1, 30)]
        self._normalize_list_column('related_emotions', keep_emotions)