Aurel-test committed on
Commit
c137842
·
verified ·
1 Parent(s): faf0f15

Persist logs in HF Dataset

Browse files
Files changed (1) hide show
  1. app.py +1573 -0
app.py ADDED
@@ -0,0 +1,1573 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Web Demo v2 pour la base de données d'œuvres d'art - Version Sécurisée et Optimisée
4
+ Interface multi-étapes avec matching basé sur prénom, date, ville et émotions
5
+ Optimisé pour les performances avec caching et indexation
6
+ Version sécurisée avec validation des entrées et gestion d'état propre
7
+ """
8
+
9
+ import gradio as gr
10
+ import os
11
+ import sys
12
+ import logging
13
+ from logging.handlers import RotatingFileHandler
14
+ import random
15
+ import re
16
+ import json
17
+ import uuid
18
+ import time
19
+ from datetime import datetime
20
+ from typing import List, Dict, Tuple, Optional, Any, Set
21
+ from collections import Counter, defaultdict
22
+ from functools import lru_cache
23
+ from dataclasses import dataclass, field, asdict
24
+ from pathlib import Path
25
+ import pandas as pd
26
+
27
# Main application logging: timestamped INFO-level records on the root logger.
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Optional dependency: only needed to persist logs to a Hugging Face dataset.
try:
    from huggingface_hub import CommitScheduler
    HF_HUB_AVAILABLE = True
except ImportError:
    HF_HUB_AVAILABLE = False
    logger.warning("huggingface_hub non installé - Les logs ne seront pas sauvegardés dans un dataset HF")

# File names used for per-session records and aggregated statistics.
SESSION_LOG_FILE = "session_logs.jsonl"
STATS_LOG_FILE = "statistics.json"

# Hugging Face dataset used for persistence; override through the environment.
HF_DATASET_ID = os.environ.get("HF_DATASET_ID", "ClickMons/art-matcher-logs")
HF_TOKEN = os.environ.get("HF_TOKEN", None)  # HF authentication token
LOGS_UPLOAD_INTERVAL = 10  # minutes between two uploads

# Local rotating file for session records. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("logs", exist_ok=True)

session_file_handler = RotatingFileHandler(
    filename=os.path.join("logs", SESSION_LOG_FILE),
    maxBytes=10*1024*1024,  # 10MB
    backupCount=5,
    encoding='utf-8'
)
session_file_handler.setLevel(logging.INFO)
session_logger = logging.getLogger('session_logger')
session_logger.addHandler(session_file_handler)
session_logger.setLevel(logging.INFO)
66
+
67
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
68
+
69
+ from art_pieces_db.database import Database
70
+ from art_pieces_db.query import TargetProfile, WeightedLeximaxOptimizer, Optimizer
71
+ from art_pieces_db.emotions import EmotionWheel
72
+ from art_pieces_db.utils import str_to_date
73
+
74
+
75
@dataclass
class ScoringWeights:
    """Single home for the tuning constants, so no magic numbers are scattered in the code."""

    # Relative weight given to each criterion during preselection.
    PRESELECTION_NAME_WEIGHT: float = 3.0
    PRESELECTION_DATE_WEIGHT: float = 1.0
    PRESELECTION_PLACE_WEIGHT: float = 2.0
    PRESELECTION_EMOTION_WEIGHT: float = 0.0

    # Lower bound on the number of preselected artworks.
    MIN_PRESELECTION_COUNT: int = 20
    # Number of images shown per selection step.
    MAX_IMAGES_PER_SELECTION: int = 3
    # Number of rounds played before the final recommendation.
    TOTAL_ROUNDS: int = 3
87
+
88
+
89
@dataclass
class SessionState:
    """Mutable per-visitor state carried across the steps of the interface."""

    # Information typed by the visitor.
    firstname: str = ""
    birthday: str = ""
    city: str = ""

    # Progress through the image-selection rounds.
    current_round: int = 0
    selected_images: List[str] = field(default_factory=list)
    current_image_ids: List[str] = field(default_factory=list)

    # Result of the preselection step, if one has been computed.
    preselected_pieces: Optional[pd.DataFrame] = None

    # Tracking data written to the session log.
    session_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    session_start_time: float = field(default_factory=time.time)
    recommendation_type: str = ""  # "name_date_place" or "emotions"
    final_artwork: str = ""

    def reset(self):
        """Return every field to its default, with a new session id and start time."""
        # A fresh instance carries exactly the default values (including a new
        # uuid, a new timestamp and new empty lists), so copy them over.
        pristine = SessionState()
        for attribute, value in vars(pristine).items():
            setattr(self, attribute, value)

    def is_complete(self) -> bool:
        """Tell whether all selection rounds have been played."""
        rounds_required = ScoringWeights.TOTAL_ROUNDS
        return not (self.current_round < rounds_required)
126
+
127
+
128
class SessionLogger:
    """Record finished sessions and maintain aggregated usage statistics.

    When running on a Hugging Face Space with ``huggingface_hub`` installed and
    ``HF_TOKEN`` set, files are written to ``hf_logs_data`` and a
    ``CommitScheduler`` is created to upload that folder to ``HF_DATASET_ID``.
    Otherwise files are written to the local ``logs`` directory.
    """

    def __init__(self):
        """Select the log directory and, when possible, start the upload scheduler."""
        # Either variable being set is taken as "running on a HF Space".
        is_hf_space = os.environ.get('SPACE_ID') or os.environ.get('SPACE_HOST')

        self.scheduler = None
        self.use_hf_dataset = False

        if is_hf_space and HF_HUB_AVAILABLE and HF_TOKEN:
            self.logs_dir = Path("hf_logs_data")
            self.logs_dir.mkdir(exist_ok=True)

            # The scheduler uploads the folder in the background.
            try:
                self.scheduler = CommitScheduler(
                    repo_id=HF_DATASET_ID,
                    repo_type="dataset",
                    folder_path=self.logs_dir,
                    path_in_repo="logs",
                    every=LOGS_UPLOAD_INTERVAL,
                    token=HF_TOKEN
                )
                logger.info(f"CommitScheduler initialisé - Sauvegarde dans {HF_DATASET_ID} toutes les {LOGS_UPLOAD_INTERVAL} minutes")
                self.use_hf_dataset = True
            except Exception as e:
                # Best effort: fall back to non-persistent files in the same folder.
                logger.error(f"Impossible d'initialiser CommitScheduler: {e}")
                self.scheduler = None
                self.use_hf_dataset = False
        else:
            # Local development, or HF persistence is not configured.
            self.logs_dir = Path("logs")
            self.logs_dir.mkdir(exist_ok=True)

            if is_hf_space:
                logger.warning("Sur HF Spaces mais CommitScheduler non configuré - Les logs seront éphémères")
                logger.info("Pour activer la persistance, configurez HF_DATASET_ID et HF_TOKEN dans les secrets du Space")

        self.session_log_path = self.logs_dir / SESSION_LOG_FILE
        self.stats_log_path = self.logs_dir / STATS_LOG_FILE

        if self.use_hf_dataset:
            # One timestamped session file per instance to avoid conflicts.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.session_log_path = self.logs_dir / f"sessions_{timestamp}.jsonl"
            self.stats_log_path = self.logs_dir / "statistics_latest.json"

        # The module-level ``session_logger`` appends to logs/<SESSION_LOG_FILE>.
        # If that is also our direct target, writing through both would store
        # every session twice in the same file, so only mirror when they differ.
        handler_target = Path("logs", SESSION_LOG_FILE).resolve()
        self._mirror_to_session_logger = self.session_log_path.resolve() != handler_target

    def _append_session_line(self, line: str) -> None:
        """Append one JSON line to the session log file."""
        with open(self.session_log_path, 'a', encoding='utf-8') as f:
            f.write(line + '\n')

    def log_session(self, state: SessionState, recommendation_system: str):
        """Persist one finished session and refresh the global statistics.

        Args:
            state: Session whose id, start time and final artwork are recorded.
            recommendation_system: Label stored as ``recommendation_type``.

        Any failure is logged and swallowed.
        """
        session_duration = time.time() - state.session_start_time

        session_data = {
            "session_id": state.session_id,
            "timestamp": datetime.now().isoformat(),
            "duration_seconds": round(session_duration, 2),
            "recommended_artwork": state.final_artwork,
            "recommendation_type": recommendation_system
        }

        try:
            line = json.dumps(session_data, ensure_ascii=False)

            if self.scheduler:
                # Hold the scheduler lock while touching the uploaded folder.
                with self.scheduler.lock:
                    self._append_session_line(line)
            else:
                self._append_session_line(line)

            logger.info(f"Session {state.session_id} logged successfully")
            if self._mirror_to_session_logger:
                session_logger.info(line)

            # Called outside the lock: update_statistics takes it itself.
            self.update_statistics(session_data)

            if self.use_hf_dataset:
                logger.info(f"Session sauvegardée - Upload automatique vers {HF_DATASET_ID} dans max {LOGS_UPLOAD_INTERVAL} minutes")

        except Exception as e:
            logger.error(f"Error logging session: {e}")

    def _load_statistics(self) -> dict:
        """Read the statistics file, falling back to empty counters.

        A missing, unreadable or malformed file yields fresh counters instead
        of an exception, so one bad file cannot block all future updates.
        """
        stats = None
        if os.path.exists(self.stats_log_path):
            try:
                with open(self.stats_log_path, 'r', encoding='utf-8') as f:
                    stats = json.load(f)
            except (OSError, ValueError) as e:  # JSONDecodeError is a ValueError
                logger.warning(f"Statistics file unreadable, starting from empty counters: {e}")

        if not isinstance(stats, dict):
            stats = {}

        # Fill in whatever is missing so the update below never hits a KeyError.
        stats.setdefault("total_sessions", 0)
        stats.setdefault("total_duration_seconds", 0)
        stats.setdefault("average_duration_seconds", 0)
        usage = stats.setdefault("recommendation_systems_usage", {})
        usage.setdefault("name_date_place", 0)
        usage.setdefault("emotions", 0)
        stats.setdefault("artworks_recommended", {})
        stats.setdefault("last_updated", None)
        return stats

    def _apply_statistics_update(self, session_data: dict) -> None:
        """Load, update and rewrite the statistics file (caller holds any lock)."""
        stats = self._load_statistics()

        stats["total_sessions"] += 1
        stats["total_duration_seconds"] += session_data["duration_seconds"]
        stats["average_duration_seconds"] = stats["total_duration_seconds"] / stats["total_sessions"]

        # Only the known recommendation systems are counted.
        rec_type = session_data["recommendation_type"]
        if rec_type in stats["recommendation_systems_usage"]:
            stats["recommendation_systems_usage"][rec_type] += 1

        artwork = session_data["recommended_artwork"]
        if artwork:
            recommended = stats["artworks_recommended"]
            recommended[artwork] = recommended.get(artwork, 0) + 1

        if stats["artworks_recommended"]:
            title, count = max(stats["artworks_recommended"].items(), key=lambda x: x[1])
            stats["most_recommended_artwork"] = {
                "title": title,
                "count": count
            }

        # Share of each recommendation system, as a percentage.
        total_recs = sum(stats["recommendation_systems_usage"].values())
        if total_recs > 0:
            stats["system_utility_percentage"] = {
                system: (count / total_recs * 100)
                for system, count in stats["recommendation_systems_usage"].items()
            }

        stats["last_updated"] = datetime.now().isoformat()

        with open(self.stats_log_path, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)

    def update_statistics(self, session_data: dict):
        """Fold one session record into the global statistics file.

        Args:
            session_data: Record produced by ``log_session``; must contain
                ``duration_seconds``, ``recommendation_type`` and
                ``recommended_artwork``.

        Any failure is logged and swallowed.
        """
        try:
            if self.scheduler:
                # Keep the whole read-modify-write under the lock; locking
                # only the write would let concurrent sessions lose updates.
                with self.scheduler.lock:
                    self._apply_statistics_update(session_data)
            else:
                self._apply_statistics_update(session_data)

            logger.info("Global statistics updated")

        except Exception as e:
            logger.error(f"Error updating statistics: {e}")

    def get_statistics(self) -> dict:
        """Return the stored statistics, or an empty dict if none can be read."""
        try:
            if os.path.exists(self.stats_log_path):
                with open(self.stats_log_path, 'r', encoding='utf-8') as f:
                    return json.load(f)
            return {}
        except Exception as e:
            logger.error(f"Error reading statistics: {e}")
            return {}
294
+
295
+
296
# Module-wide session logger instance, created at import time.
session_tracker = SessionLogger()
298
+
299
+
300
class SecurityValidator:
    """Central place for input and filename validation helpers."""

    PATH_TRAVERSAL_PATTERN = re.compile(r"\.\.|\.\/")
    VALID_FILENAME_PATTERN = re.compile(r"^[\w\-\.\s]+$")
    VALID_INPUT_PATTERN = re.compile(
        r"^[\w\-\s\'\.,àâäéèêëïîôûùüÿæœçÀÂÄÉÈÊËÏÎÔÛÙÜŸÆŒÇ]+$", re.UNICODE
    )
    DATE_PATTERN = re.compile(r"^\d{1,2}/\d{1,2}$")

    @classmethod
    def validate_filename(cls, filename: str) -> bool:
        """Return True when *filename* looks safe to use.

        The name is rejected when it is empty, contains ``..`` or ``./``, or
        when its basename has characters outside word characters, dashes,
        dots and whitespace.
        """
        if not filename:
            return False

        if cls.PATH_TRAVERSAL_PATTERN.search(filename):
            logger.warning(f"Tentative de path traversal détectée: {filename}")
            return False

        # Only the last path component is checked against the allowed set.
        base_name = os.path.basename(filename)
        if not cls.VALID_FILENAME_PATTERN.fullmatch(base_name):
            logger.warning(f"Nom de fichier invalide: {filename}")
            return False

        return True

    @classmethod
    def sanitize_input(cls, input_str: str, max_length: int = 100) -> str:
        """Truncate, strip and filter a user-supplied string.

        Args:
            input_str: Raw user input; falsy values give ``""``.
            max_length: Maximum number of characters kept before stripping.

        Returns:
            The input unchanged when every character is allowed, otherwise
            the input with disallowed characters removed.
        """
        if not input_str:
            return ""

        input_str = input_str[:max_length].strip()

        if not cls.VALID_INPUT_PATTERN.fullmatch(input_str):
            # Keep only the characters that are individually acceptable.
            cleaned = "".join(c for c in input_str if cls.VALID_INPUT_PATTERN.fullmatch(c))
            logger.info(f"Input sanitized: '{input_str}' -> '{cleaned}'")
            return cleaned

        return input_str

    @classmethod
    def validate_date(cls, date_str: str) -> Tuple[bool, Optional[datetime]]:
        """Validate and parse a ``DD/MM`` date.

        Returns:
            ``(True, datetime)`` with the year fixed to 2000 (a leap year, so
            29/02 is accepted) when the date is valid, else ``(False, None)``.
        """
        if not date_str:
            return False, None

        # fullmatch, not match: with match(), the trailing "$" also accepts a
        # final newline, so "12/05\n" would pass validation.
        if not cls.DATE_PATTERN.fullmatch(date_str):
            return False, None

        try:
            day, month = map(int, date_str.split("/"))
            if not (1 <= day <= 31 and 1 <= month <= 12):
                return False, None

            # Raises ValueError for impossible dates such as 31/02.
            date_obj = datetime(year=2000, month=month, day=day)
            return True, date_obj
        except ValueError as e:
            logger.error(f"Erreur de parsing de date: {e}")
            return False, None
365
+
366
+
367
class ImageIndexer:
    """Index the image files of a directory and resolve database image names to files."""

    # Constants for better maintainability
    IMAGE_EXTENSIONS = (".jpg", ".png")
    COMMON_SUFFIXES = [".jpg", ".png", "_medium"]
    MAR_BVM_TEST_SUFFIXES = ["-001", "-002", "-003"]

    def __init__(self, images_dir: str):
        """Build the index for *images_dir* immediately."""
        self.images_dir = os.path.abspath(images_dir)
        self.available_files = set()
        self.image_lookup = {}  # normalized_name -> filename
        self.mar_bvm_lookup = {}  # base MAR-BVM key -> [filenames]
        self._build_index()

    def _strip_file_extensions(self, filename: str) -> str:
        """Lower-case *filename* and drop a ``_medium.jpg``, ``.jpg`` or ``.png`` ending."""
        base_name = filename.lower()
        if base_name.endswith("_medium.jpg"):
            return base_name[: -len("_medium.jpg")]
        if base_name.endswith((".jpg", ".png")):
            return base_name[:-4]
        return base_name

    def _normalize_basic_patterns(self, name: str) -> str:
        """Lower-case, trim, drop known suffixes and turn spaces/underscores into dashes."""
        normalized = name.lower().strip().rstrip(",")

        # Order matters: the extension is removed before "_medium".
        for suffix in self.COMMON_SUFFIXES:
            if normalized.endswith(suffix):
                normalized = normalized[: -len(suffix)]

        return re.sub(r"[\s_]+", "-", normalized)

    def _normalize_mar_bvm_format(self, name: str) -> str:
        """Replace ``.0.`` with ``-0-`` and remaining dots with dashes in MAR-BVM names."""
        if "mar-bvm" not in name:
            return name
        return name.replace(".0.", "-0-").replace(".", "-")

    def _normalize_name(self, name: str) -> str:
        """Return the canonical lookup key for *name*."""
        normalized = self._normalize_basic_patterns(name)

        if "mar-bvm" in normalized:
            normalized = self._normalize_mar_bvm_format(normalized)
        # Names starting with a year (like 2022.0.86) keep their dots.
        elif not normalized.startswith("20"):
            normalized = normalized.replace(".", "-")

        return normalized

    def _create_mar_bvm_lookups(self, normalized: str, filename: str):
        """Register *filename* under its six-part MAR-BVM base key, if it has one."""
        if "mar-bvm" not in normalized:
            return

        parts = normalized.split("-")
        # A numeric part at index 5 or later (after mar-bvm-7-2022-0) is required.
        if any(part.isdigit() for part in parts[5:]):
            base_key = "-".join(parts[:6])  # e.g. mar-bvm-7-2022-0-22
            self.mar_bvm_lookup.setdefault(base_key, []).append(filename)

    def _process_image_file(self, filename: str):
        """Add one directory entry to the index when it is a safe image file."""
        if not SecurityValidator.validate_filename(filename):
            logger.warning(f"Fichier ignoré pour raison de sécurité: {filename}")
            return

        if not filename.lower().endswith(self.IMAGE_EXTENSIONS):
            return

        self.available_files.add(filename)

        base_name = self._strip_file_extensions(filename)
        normalized = self._normalize_name(base_name)
        self.image_lookup[normalized] = filename
        self._create_mar_bvm_lookups(normalized, filename)

    def _build_index(self):
        """Scan the images directory and populate the lookup tables.

        On any error the index is left completely empty and the error is logged.
        """
        try:
            # Sorted so that key collisions and partial matches resolve the
            # same way on every machine (os.listdir order is arbitrary).
            for filename in sorted(os.listdir(self.images_dir)):
                self._process_image_file(filename)

            logger.info(
                f"Index des images construit: {len(self.available_files)} fichiers disponibles, "
                f"{len(self.image_lookup)} entrées normalisées"
            )
        except Exception as e:
            logger.error(f"Erreur lors de la construction de l'index: {e}")
            # Do not keep a half-built index around.
            self.available_files = set()
            self.image_lookup = {}
            self.mar_bvm_lookup = {}

    def _clean_input_name(self, image_name: str) -> str:
        """Trim *image_name*, drop trailing ``,``/``-`` and spaces before ``-<digit>``."""
        cleaned = image_name.strip().rstrip(",").rstrip("-").strip()
        return re.sub(r"\s+(-\d)", r"\1", cleaned)

    def _normalize_mar_bvm_input(self, image_name: str) -> str:
        """Rewrite the MAR-BVM spellings found in the database to the dashed form."""
        if "MAR-BVM" not in image_name:
            return image_name

        # Handle missing "7-" in MAR-BVM-2022-0-153
        if "MAR-BVM-2022-0-" in image_name:
            image_name = image_name.replace("MAR-BVM-2022-0-", "MAR-BVM-7-2022-0-")

        # Convert .0. to -0-
        if ".0." in image_name:
            image_name = image_name.replace(".0.", "-0-")

        # Handle .001, .002 at the end (convert to -001, -002)
        image_name = re.sub(r"\.(\d{3})$", r"-\1", image_name)

        # Handle .1 or .2 suffix
        if image_name.endswith(".1"):
            image_name = image_name[:-2] + "-1"
        elif image_name.endswith(".2"):
            image_name = image_name[:-2] + "-2"

        # Any dot still present becomes a dash.
        return image_name.replace(".", "-")

    def _try_mar_bvm_lookups(self, normalized: str) -> Optional[str]:
        """Resolve a MAR-BVM key through the base-key table, then ``-00N`` suffixes."""
        candidates = self.mar_bvm_lookup.get(normalized)
        if candidates:
            return candidates[0]

        for suffix in self.MAR_BVM_TEST_SUFFIXES:
            test_pattern = f"{normalized}{suffix}"
            if test_pattern in self.image_lookup:
                return self.image_lookup[test_pattern]

        return None

    def _try_year_format_lookup(self, image_name: str) -> Optional[str]:
        """Look up names starting with ``20`` using only lower-casing and space→dash."""
        if not image_name.startswith("20"):
            return None

        test_name = image_name.lower().replace(" ", "-")
        return self.image_lookup.get(test_name)

    def _try_partial_matching(self, normalized: str) -> Optional[str]:
        """Last-resort match: a key starting with *normalized*, else one containing it.

        An empty *normalized* never matches; it would otherwise be "contained"
        in every key and return an arbitrary image.
        """
        if not normalized:
            return None

        for key, filename in self.image_lookup.items():
            if key.startswith(normalized):
                return filename

        for key, filename in self.image_lookup.items():
            if normalized in key:
                return filename

        return None

    def _split_multiple_names(self, image_name: str) -> List[str]:
        """Split a field holding several names separated by ``,``, ``/`` or one `` - ``."""
        if "," in image_name:
            return [name.strip() for name in image_name.split(",") if name.strip()]

        if "/" in image_name:
            return [name.strip() for name in image_name.split("/") if name.strip()]

        # Handle " - " separation (for cases like "MAR-BVM-7-2022.0.81 - 2022.0.81")
        if " - " in image_name and image_name.count(" - ") == 1:
            parts = [name.strip() for name in image_name.split(" - ")]
            if len(parts) == 2:
                first, second = parts
                # The second part repeating inside the first marks a duplicate.
                if first.endswith(second) or second in first:
                    return [first]
            return parts

        return [image_name]

    def find_image(self, image_name: str) -> Optional[str]:
        """Return the indexed filename matching *image_name*, or ``None``.

        The field may hold several candidate names; the first one that
        resolves wins.
        """
        if not image_name:
            return None

        for name in self._split_multiple_names(image_name):
            result = self._find_single_image(name)
            if result:
                return result

        return None

    def _find_single_image(self, image_name: str) -> Optional[str]:
        """Resolve one candidate name through progressively looser strategies."""
        cleaned_name = self._clean_input_name(image_name)
        processed_name = self._normalize_mar_bvm_input(cleaned_name)
        normalized = self._normalize_name(processed_name)

        # Names made only of separators/whitespace carry no information.
        if not normalized:
            return None

        # 1. Exact key
        if normalized in self.image_lookup:
            return self.image_lookup[normalized]

        # 2. MAR-BVM specific tables
        if "mar-bvm" in normalized:
            result = self._try_mar_bvm_lookups(normalized)
            if result:
                return result

        # 3. Year-prefixed names, using the raw input
        result = self._try_year_format_lookup(image_name)
        if result:
            return result

        # 4. Prefix / substring match
        return self._try_partial_matching(normalized)

    def get_all_files(self) -> Set[str]:
        """Return a copy of the set of indexed filenames."""
        return self.available_files.copy()
598
+
599
+
600
+ class ArtMatcherV2:
601
+ """Classe principale pour le matching d'œuvres d'art"""
602
+
603
def __init__(self, csv_path: str, images_dir: str):
    """Load the artwork database and index the images directory.

    Args:
        csv_path: Path handed to ``Database``.
        images_dir: Directory holding the image files.
    """
    self.db = Database(csv_path)
    self.images_dir = os.path.abspath(images_dir)
    self.emotion_wheel = EmotionWheel()
    self.weights = ScoringWeights()

    # Helper instance, used elsewhere in the class for place similarity.
    self.optimizer_helper = WeightedLeximaxOptimizer(TargetProfile(), {})

    self.image_indexer = ImageIndexer(images_dir)

    # Keep only the rows that declare a non-blank image name.
    full_df = self.db.get_dataframe()
    image_names = full_df["name_image"]
    has_image_name = (
        image_names.notna()
        & (image_names != "")
        & (image_names.str.strip() != "")
    )
    self.df_with_images = full_df[has_image_name].copy()

    self.df_with_images["database_id_str"] = self.df_with_images[
        "database_id"
    ].astype(str)

    # database id (as text) -> dataframe index label
    self.id_to_index = {}
    for row_label, row in self.df_with_images.iterrows():
        self.id_to_index[str(row["database_id"])] = row_label

    self.artwork_images = self._build_artwork_image_index()

    # Database wrapper around the filtered rows, built without running
    # Database.__init__.
    filtered_db = Database.__new__(Database)
    filtered_db.dataframe = self.df_with_images
    self.temp_db_with_images = filtered_db

    logger.info(f"Base de données chargée: {self.db.n_pieces()} œuvres")
    logger.info(f"Œuvres avec images: {len(self.df_with_images)}")
    logger.info(f"Index des images: {len(self.artwork_images)} œuvres mappées")
636
+
637
def _sanitize_input(self, input_str: str) -> str:
    """Clean a user-supplied string by delegating to ``SecurityValidator.sanitize_input``."""
    cleaned = SecurityValidator.sanitize_input(input_str)
    return cleaned
640
+
641
def _parse_date(self, date_str: str) -> Optional[datetime]:
    """Return the parsed date when ``SecurityValidator.validate_date`` accepts it, else ``None``."""
    accepted, parsed = SecurityValidator.validate_date(date_str)
    if not accepted:
        return None
    return parsed
645
+
646
+ def _build_artwork_image_index(self) -> Dict[str, List[str]]:
647
+ """Construit un index artwork_id -> [image_paths] au démarrage"""
648
+ artwork_images = {}
649
+
650
+ for idx, row in self.df_with_images.iterrows():
651
+ artwork_id = str(row["database_id"])
652
+ image_paths = []
653
+
654
+ if row["name_image"] and str(row["name_image"]).strip():
655
+ # Parse the image names - handle special separators
656
+ image_string = str(row["name_image"]).strip().strip('"')
657
+
658
+ # Handle cases with " / " or " - " separators
659
+ if " / " in image_string:
660
+ # Take first part before the slash
661
+ image_string = image_string.split(" / ")[0].strip()
662
+
663
+ # Special case: if it has " - 2022" it's a separator, not part of the name
664
+ if " - 2022" in image_string:
665
+ # Take the part before " - 2022"
666
+ image_string = image_string.split(" - 2022")[0].strip()
667
+ elif " - " in image_string and "MAR-BVM-7-2022-0-" not in image_string:
668
+ # For other MAR-BVM formats with " - " separator
669
+ parts = image_string.split(" - ")
670
+ if "MAR-BVM" in parts[0]:
671
+ image_string = parts[0].strip()
672
+
673
+ # Clean up trailing " -" or spaces before "-001"
674
+ image_string = re.sub(
675
+ r"\s+-\s*$", "", image_string
676
+ ) # Remove trailing " -"
677
+ image_string = re.sub(
678
+ r"\s+(-\d)", r"\1", image_string
679
+ ) # Remove spaces before -001
680
+
681
+ # Parse comma-separated list
682
+ images = [
683
+ img.strip()
684
+ for img in re.split(r"[,/]", image_string)
685
+ if img.strip()
686
+ ]
687
+
688
+ for img_name in images:
689
+ # Find the actual file for this image name
690
+ matched_file = self.image_indexer.find_image(img_name)
691
+ if matched_file:
692
+ img_path = os.path.join(self.images_dir, matched_file)
693
+ image_paths.append(img_path)
694
+
695
+ if image_paths:
696
+ artwork_images[artwork_id] = image_paths
697
+
698
+ return artwork_images
699
+
700
def preselect_artworks(
    self, firstname: str, birthday: str, city: str
) -> pd.DataFrame:
    """Preselect artworks from the visitor's first name, birthday and city.

    The first name and city are sanitized, the birthday is parsed, and a
    ``WeightedLeximaxOptimizer`` ranks the artworks that have images. Rows
    with a positive score are kept; if there are fewer than
    ``MIN_PRESELECTION_COUNT`` of them, the top rows of the ranking are
    returned instead.
    """
    logger.info("=== DÉBUT PRÉ-SÉLECTION ===")

    firstname = self._sanitize_input(firstname)
    city = self._sanitize_input(city)

    logger.info(
        f"Critères de pré-sélection: prénom='{firstname}', date='{birthday}', ville='{city}'"
    )

    birth_date = self._parse_date(birthday)
    if birth_date:
        logger.info(f"Date convertie: {birth_date.strftime('%d/%m')}")

    profile = TargetProfile()
    profile.set_target_name(firstname)
    profile.set_target_date(birth_date)
    profile.set_target_place(city)

    cfg = self.weights
    weights = {
        "related_names": cfg.PRESELECTION_NAME_WEIGHT,
        "related_dates": cfg.PRESELECTION_DATE_WEIGHT,
        "related_places": cfg.PRESELECTION_PLACE_WEIGHT,
        "related_emotions": cfg.PRESELECTION_EMOTION_WEIGHT,
    }

    logger.info(
        f"Poids utilisés: nom={weights['related_names']}, date={weights['related_dates']}, lieu={weights['related_places']}, émotions={weights['related_emotions']}"
    )

    ranking = WeightedLeximaxOptimizer(profile, weights).optimize_max(
        self.temp_db_with_images
    )

    # NOTE(review): this compares the "score" column with a 3-tuple; whether
    # that is an element-wise tuple comparison depends on what the optimizer
    # stores in "score" — confirm against art_pieces_db.query.
    preselected = ranking[ranking["score"] > (0, 0, 0)]
    logger.info(f"Œuvres avec score > 0: {len(preselected)}")

    minimum = self.weights.MIN_PRESELECTION_COUNT
    if len(preselected) < minimum:
        preselected = ranking.head(minimum)
        logger.info(f"Ajustement au minimum requis: {len(preselected)} œuvres")

    # Diagnostic output for the five best candidates.
    logger.info("Top 5 pré-sélections:")
    for rank, (_, piece) in enumerate(preselected.head(5).iterrows(), start=1):
        logger.info(
            f" {rank}. Œuvre #{piece['database_id']} - Score: {piece['score']}"
        )

        if firstname and piece["related_names"]:
            name_score = Optimizer.name_similarity(
                firstname, piece["related_names"]
            )
            if name_score > 0:
                logger.info(
                    f" → Nom: {piece['related_names']} (score: {name_score:.2f})"
                )

        if birth_date and piece["related_dates"]:
            date_score = Optimizer.date_similarity(
                birth_date, piece["related_dates"]
            )
            if date_score > 0:
                logger.info(
                    f" → Dates: {[d.strftime('%d/%m') for d in piece['related_dates']]} (score: {date_score:.2f})"
                )

        if city and piece["related_places"]:
            place_score = self.optimizer_helper.place_similarity(
                city, piece["related_places"]
            )
            if place_score > 0:
                logger.info(
                    f" → Lieux: {piece['related_places']} (score: {place_score:.2f})"
                )

    logger.info("=== FIN PRÉ-SÉLECTION ===")
    return preselected
778
+
779
def get_random_images_for_selection(
    self, round_num: int, already_selected: List[str] = None
) -> List[Tuple[str, str]]:
    """Pick the images offered to the user for one selection round.

    Artworks are drawn at random from the keys of ``self.artwork_images``,
    leaving out every ID listed in ``already_selected``. One image path is
    then chosen at random for each drawn artwork.

    Args:
        round_num: Round number, used only in the log messages.
        already_selected: Artwork IDs picked in earlier rounds; ``None`` or
            an empty list excludes nothing.

    Returns:
        A list of ``(image_path, artwork_id)`` tuples. When fewer than
        ``self.weights.MAX_IMAGES_PER_SELECTION`` artworks remain, a fallback
        list is returned instead: ``.jpg`` files taken from the first ten
        entries of ``self.image_indexer.get_all_files()``, each paired with
        the placeholder ID ``"0"``.
    """
    logger.info(f"=== SÉLECTION D'IMAGES POUR LE TOUR {round_num} ===")

    pool = list(self.artwork_images.keys())
    if already_selected:
        logger.info(f"Œuvres déjà sélectionnées à exclure: {already_selected}")
        # A set keeps the membership test cheap while filtering the pool.
        excluded = set(already_selected)
        pool = [artwork_id for artwork_id in pool if artwork_id not in excluded]

    logger.info(f"Nombre total d'œuvres avec images disponibles: {len(pool)}")

    per_round = self.weights.MAX_IMAGES_PER_SELECTION
    if len(pool) < per_round:
        logger.warning(f"Seulement {len(pool)} œuvres avec images disponibles")
        # Not enough indexed artworks: fall back to raw files from the
        # image directory, tagged with the placeholder ID "0".
        first_files = list(self.image_indexer.get_all_files())[:10]
        fallback = [
            (os.path.join(self.images_dir, name), "0")
            for name in first_files
            if name.endswith(".jpg")
        ]
        return fallback[:per_round]

    drawn_ids = random.sample(pool, min(per_round, len(pool)))
    logger.info(f"Œuvres sélectionnées aléatoirement: {drawn_ids}")

    picks = []
    for position, artwork_id in enumerate(drawn_ids, start=1):
        picks.append((random.choice(self.artwork_images[artwork_id]), artwork_id))
        if artwork_id not in self.id_to_index:
            continue
        # Metadata is looked up for logging only.
        row = self.df_with_images.loc[self.id_to_index[artwork_id]]
        logger.info(f" Image {position}: Œuvre #{artwork_id}")
        logger.info(f" Type: {row['art_piece_type']}")
        logger.info(f" Émotions: {row['related_emotions']}")

    logger.info(f"=== FIN SÉLECTION IMAGES TOUR {round_num} ===")
    return picks
835
+
836
def extract_emotions_from_image_id(self, database_id: str) -> List[str]:
    """Return the emotions recorded for the artwork with the given ID.

    The ID is resolved to a row label through ``self.id_to_index`` and the
    ``related_emotions`` cell of that row is read from
    ``self.df_with_images``.

    Args:
        database_id: Artwork ID, as used for the keys of ``self.id_to_index``.

    Returns:
        The stored list of emotions, or an empty list when the ID is unknown
        or the stored value is not a list.
    """
    if database_id not in self.id_to_index:
        return []

    row_label = self.id_to_index[database_id]
    cell = self.df_with_images.loc[row_label, "related_emotions"]
    return cell if isinstance(cell, list) else []
847
+
848
def _cached_emotion_similarity(self, emotion1: str, emotion2: str) -> float:
    """Return the similarity of two emotions, memoised per instance.

    Results of ``self.emotion_wheel.calculate_emotion_similarity`` are kept
    in a dictionary stored on the instance itself. A cache attached to the
    method (for example ``functools.lru_cache``) would use ``self`` as part
    of the key, which requires the instance to be hashable and keeps it
    alive for as long as the cache exists.

    Args:
        emotion1: First emotion name.
        emotion2: Second emotion name.

    Returns:
        The value computed by the emotion wheel for this ordered pair.
    """
    cache = getattr(self, "_emotion_similarity_cache", None)
    if cache is None:
        cache = {}
        self._emotion_similarity_cache = cache

    key = (emotion1, emotion2)
    if key not in cache:
        if len(cache) >= 1024:
            # Keep the cache bounded: drop the oldest inserted pair.
            del cache[next(iter(cache))]
        cache[key] = self.emotion_wheel.calculate_emotion_similarity(
            emotion1, emotion2
        )
    return cache[key]
852
+
853
def calculate_emotion_profile(self, selected_ids: List[str]) -> Dict[str, float]:
    """Build the user's emotion profile from the images they picked.

    The emotions of every picked artwork are counted, then each count is
    divided by the total number of counted emotions.

    Args:
        selected_ids: IDs of the artworks the user selected.

    Returns:
        A mapping from emotion to its share of all counted emotions, or an
        empty dict when none of the selected artworks has any emotion.
    """
    logger.info("=== CALCUL DU PROFIL ÉMOTIONNEL ===")
    logger.info(f"Images sélectionnées: {selected_ids}")

    tally = Counter()
    for artwork_id in selected_ids:
        found = self.extract_emotions_from_image_id(artwork_id)
        logger.info(f" Image {artwork_id}: émotions = {found}")
        tally.update(found)

    occurrences = sum(tally.values())
    if occurrences <= 0:
        profile = {}
        logger.info("Aucune émotion trouvée dans les images sélectionnées")
    else:
        profile = {name: hits / occurrences for name, hits in tally.items()}
        logger.info(f"Profil émotionnel calculé: {profile}")

    logger.info("=== FIN CALCUL PROFIL ÉMOTIONNEL ===")
    return profile
879
+
880
def _get_artwork_image(self, artwork) -> Optional[str]:
    """Return the first indexed image path of an artwork.

    Args:
        artwork: Row-like object exposing a ``"database_id"`` entry.

    Returns:
        The first path listed for the artwork in ``self.artwork_images``
        (keyed by the ID converted to ``str``), or ``None`` when the artwork
        is not in that index.
    """
    key = str(artwork["database_id"])
    if key not in self.artwork_images:
        return None
    return self.artwork_images[key][0]
889
+
890
def find_best_match(
    self, firstname: str, birthday: str, city: str, selected_image_ids: List[str]
) -> Tuple[Optional[str], str, Dict]:
    """Find the artwork that best matches the visitor.

    Decision order:

    1. Exact match. The pre-selected artworks are scanned in order; for each
       one the first name (similarity >= 0.95), then the birthday
       (similarity == 1.0), then the city (similarity == 1.0) is checked and
       the first hit is returned immediately.
    2. Emotions. Otherwise candidates are ranked by emotional similarity
       with the profile derived from ``selected_image_ids``. Candidates are
       the pre-selected artworks with a score above ``(0, 0, 0)`` when there
       are any, else every row of ``self.df_with_images``. Artworks the
       visitor already picked are removed from the candidates.
    3. Tie-break. Equal emotion scores are separated by how often the
       candidate's type occurs among the picked artworks, then at random.

    Args:
        firstname: Visitor's first name (sanitised before use).
        birthday: Visitor's birthday as typed; parsed with
            ``self._parse_date``.
        city: Visitor's city (sanitised before use).
        selected_image_ids: IDs of the artworks picked during the rounds.

    Returns:
        ``(image_path, reason, info)``. ``image_path`` comes from
        ``self._get_artwork_image`` and may be ``None``; ``info`` holds the
        keys ``title``, ``type``, ``place``, ``emotions`` and
        ``explanation``. When no candidate is left the result is
        ``(None, "Aucune correspondance trouvée", {})``.
    """

    def build_match_info(piece) -> Dict:
        # Single place that shapes the details dict handed back to the UI.
        return {
            "title": f"Œuvre #{piece['database_id']}",
            "type": piece["art_piece_type"],
            "place": piece["art_piece_place"],
            "emotions": piece["related_emotions"],
            "explanation": piece["explanation"],
        }

    def emotion_score_of(piece, profile):
        # For each profile emotion take the best similarity with any of the
        # piece's emotions, weight it, then average over the piece's emotions.
        piece_emotions = piece["related_emotions"]
        # Same list check as extract_emotions_from_image_id: a cell that is
        # not a list (e.g. a missing value) carries no emotions and scores 0
        # instead of failing when iterated.
        if not profile or not isinstance(piece_emotions, list) or not piece_emotions:
            return 0
        score = 0
        for user_emotion, weight in profile.items():
            best_similarity = 0
            for piece_emotion in piece_emotions:
                similarity = self._cached_emotion_similarity(
                    user_emotion, piece_emotion
                )
                if similarity > best_similarity:
                    best_similarity = similarity
            score += best_similarity * weight
        return score / len(piece_emotions)

    firstname = self._sanitize_input(firstname)
    city = self._sanitize_input(city)
    birth_date = self._parse_date(birthday)

    logger.info(
        f"Recherche de correspondance pour: {firstname}, {birthday}, {city}"
    )

    preselected = self.preselect_artworks(firstname, birthday, city)

    logger.info("=== DÉTECTION DE MATCH EXACT ===")
    for _, piece in preselected.iterrows():
        if firstname and piece["related_names"]:
            name_score = Optimizer.name_similarity(
                firstname, piece["related_names"]
            )
            if name_score >= 0.95:
                logger.info(
                    f"🎯 MATCH EXACT TROUVÉ: prénom '{firstname}' → œuvre #{piece['database_id']} (score: {name_score:.2f})"
                )
                logger.info(f" Noms dans l'œuvre: {piece['related_names']}")
                return (
                    self._get_artwork_image(piece),
                    f"Prénom '{firstname}' correspond exactement",
                    build_match_info(piece),
                )

        if birth_date and piece["related_dates"]:
            date_score = Optimizer.date_similarity(
                birth_date, piece["related_dates"]
            )
            if date_score == 1.0:
                logger.info(
                    f"🎯 MATCH EXACT TROUVÉ: date '{birthday}' → œuvre #{piece['database_id']}"
                )
                logger.info(
                    f" Dates dans l'œuvre: {[d.strftime('%d/%m/%Y') for d in piece['related_dates']]}"
                )
                return (
                    self._get_artwork_image(piece),
                    f"Date d'anniversaire {birthday} correspond exactement",
                    build_match_info(piece),
                )

        if city and piece["related_places"]:
            place_score = self.optimizer_helper.place_similarity(
                city, piece["related_places"]
            )
            if place_score == 1.0:
                logger.info(
                    f"🎯 MATCH EXACT TROUVÉ: ville '{city}' → œuvre #{piece['database_id']}"
                )
                logger.info(f" Lieux dans l'œuvre: {piece['related_places']}")
                return (
                    self._get_artwork_image(piece),
                    f"Ville '{city}' correspond exactement",
                    build_match_info(piece),
                )

    logger.info("Aucun match exact trouvé, passage à la sélection par émotions")

    emotion_profile = self.calculate_emotion_profile(selected_image_ids)

    logger.info("=== STRATÉGIE DE MATCHING ===")
    valid_preselection = preselected[preselected["score"] > (0, 0, 0)]

    if len(valid_preselection) > 0:
        logger.info(
            f"📋 CAS A: {len(valid_preselection)} œuvres pré-sélectionnées - utilisation des émotions pour départager"
        )
        candidates = valid_preselection
    else:
        logger.info(
            f"📋 CAS B: Aucune pré-sélection valide - recherche par émotions sur {len(self.df_with_images)} œuvres"
        )
        candidates = self.df_with_images

    # Never recommend an artwork the visitor has already been shown and picked.
    selected_artwork_ids = set(selected_image_ids)
    candidates = candidates[
        ~candidates["database_id"].astype(str).isin(selected_artwork_ids)
    ]
    logger.info(
        f"Après exclusion des œuvres déjà sélectionnées {selected_artwork_ids}: {len(candidates)} candidats restants"
    )

    logger.info("=== CALCUL DES SCORES ÉMOTIONNELS ===")
    best_matches = []
    best_emotion_score = -1  # below any real score, so the first candidate always registers

    for _, piece in candidates.iterrows():
        emotion_score = emotion_score_of(piece, emotion_profile)

        if emotion_score > best_emotion_score:
            best_emotion_score = emotion_score
            best_matches = [piece]
            logger.info(
                f" Nouveau meilleur score émotionnel: {emotion_score:.3f} - Œuvre #{piece['database_id']}"
            )
        elif emotion_score == best_emotion_score and emotion_score > 0:
            # Ties are only collected for positive scores; with a score of 0
            # the first candidate seen is kept alone.
            best_matches.append(piece)
            logger.info(
                f" Score égal au meilleur: {emotion_score:.3f} - Œuvre #{piece['database_id']}"
            )

    logger.info(
        f"Nombre de meilleures correspondances: {len(best_matches)} avec score {best_emotion_score:.3f}"
    )

    if len(best_matches) > 1:
        logger.info("=== DÉPARTAGE PAR TYPE D'OBJET ===")
        selected_types = [
            self.df_with_images.loc[self.id_to_index[img_id], "art_piece_type"]
            for img_id in selected_image_ids
            if img_id in self.id_to_index
        ]
        selected_types_counter = Counter(selected_types)

        type_scored_matches = []
        best_type_score = -1

        for piece in best_matches:
            type_score = selected_types_counter.get(piece["art_piece_type"], 0)
            if type_score > best_type_score:
                best_type_score = type_score
                type_scored_matches = [piece]
            elif type_score == best_type_score:
                type_scored_matches.append(piece)

        if len(type_scored_matches) > 1:
            logger.info(
                f" {len(type_scored_matches)} œuvres avec le même score de type ({best_type_score}) - sélection aléatoire"
            )
            best_match = random.choice(type_scored_matches)
            match_reason = (
                "Sélection aléatoire parmi les meilleures correspondances"
            )
        else:
            best_match = type_scored_matches[0]
            match_reason = f"Type d'objet '{best_match['art_piece_type']}' préféré"
            logger.info(
                f" Type '{best_match['art_piece_type']}' sélectionné avec score {best_type_score}"
            )
    elif len(best_matches) == 1:
        best_match = best_matches[0]
        match_reason = "Meilleure correspondance émotionnelle"
    else:
        logger.info("Aucune correspondance trouvée")
        return None, "Aucune correspondance trouvée", {}

    # Build the human-readable explanation of why this artwork was chosen.
    reasons = []
    if len(valid_preselection) > 0:
        if firstname and best_match["related_names"]:
            name_score = Optimizer.name_similarity(
                firstname, best_match["related_names"]
            )
            if name_score > 0:
                reasons.append(f"prénom '{firstname}' trouvé")

        if birth_date and best_match["related_dates"]:
            date_score = Optimizer.date_similarity(
                birth_date, best_match["related_dates"]
            )
            if date_score > 0:
                reasons.append(
                    f"date {'exacte' if date_score == 1.0 else 'partielle'}"
                )

        if city and best_match["related_places"]:
            place_score = self.optimizer_helper.place_similarity(
                city, best_match["related_places"]
            )
            if place_score > 0:
                reasons.append(f"ville '{city}' trouvée")

    if best_emotion_score > 0:
        reasons.append(
            f"correspondance émotionnelle (score: {best_emotion_score:.2f})"
        )

    if not reasons:
        reasons.append(match_reason)

    final_reason = " ; ".join(reasons)

    logger.info(f"\n🏆 RÉSULTAT FINAL: Œuvre #{best_match['database_id']}")
    logger.info(f" Raison: {final_reason}")
    logger.info(f" Type: {best_match['art_piece_type']}")
    logger.info(f" Lieu: {best_match['art_piece_place']}")

    match_image = self._get_artwork_image(best_match)
    return match_image, final_reason, build_match_info(best_match)
1142
+
1143
+
1144
# Locations of the collection data, given relative to the working directory.
images_dir = "pictures_data"
csv_path = "PP1-Collection_Database_new-cleaned.csv"

# A missing input is only reported here; the matcher is still built below.
if not os.path.exists(csv_path):
    logger.error("Fichier CSV introuvable: " + csv_path)
if not os.path.exists(images_dir):
    logger.error("Répertoire images introuvable: " + images_dir)

# Module-wide matcher instance used by the UI callbacks of this file.
matcher = ArtMatcherV2(csv_path, images_dir)
1153
+
1154
+
1155
def process_user_info(firstname: str, birthday: str, city: str, state: SessionState):
    """Store the visitor's details in the session and validate them.

    The first name and city are sanitised, then all three values are written
    to ``state`` before any validation takes place.

    Args:
        firstname: First name as typed by the visitor.
        birthday: Birthday as typed by the visitor.
        city: City as typed by the visitor.
        state: Session object receiving ``firstname``, ``birthday`` and
            ``city``.

    Returns:
        A 5-tuple ``(info_update, selection_update, results_update, message,
        state)``. The info form stays visible when the first name or birthday
        is missing or the date is rejected by
        ``SecurityValidator.validate_date``; otherwise the selection section
        is shown instead.
    """

    def stay_on_form(message):
        # Keep the form visible and report why the input was refused.
        return (
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
            message,
            state,
        )

    clean_firstname = SecurityValidator.sanitize_input(firstname)
    clean_city = SecurityValidator.sanitize_input(city)

    state.firstname = clean_firstname
    state.birthday = birthday
    state.city = clean_city

    if not (clean_firstname and birthday):
        return stay_on_form(
            "Veuillez remplir au moins votre prénom et date de naissance."
        )

    date_ok, _ = SecurityValidator.validate_date(birthday)
    if not date_ok:
        return stay_on_form("Format de date invalide. Utilisez JJ/MM (ex: 15/03)")

    return (
        gr.update(visible=False),
        gr.update(visible=True),
        gr.update(visible=False),
        "Informations enregistrées ! Passons à la sélection d'images.",
        state,
    )
1190
+
1191
+
1192
def load_images_for_round(round_num: int, state: SessionState):
    """Fetch the images to display for one selection round.

    Args:
        round_num: Zero-based round index; shown to the user as
            ``round_num + 1``.
        state: Session object; its ``selected_images`` are excluded from the
            draw and its ``current_image_ids`` is updated on success.

    Returns:
        A 4-tuple ``(images, ids, message, state)``. When the matcher returns
        fewer than ``ScoringWeights.MAX_IMAGES_PER_SELECTION`` images, the
        result is ``([None, None, None], [], <error message>, state)`` and
        ``state.current_image_ids`` is left untouched.
    """
    drawn = matcher.get_random_images_for_selection(
        round_num, state.selected_images
    )
    found = len(drawn)

    if found < ScoringWeights.MAX_IMAGES_PER_SELECTION:
        logger.warning(f"Seulement {found} images disponibles")
        return (
            [None, None, None],
            [],
            f"Pas assez d'images disponibles (seulement {found} trouvées)",
            state,
        )

    # Split the (path, artwork_id) pairs into two parallel lists.
    paths = []
    artwork_ids = []
    for entry in drawn:
        paths.append(entry[0])
        artwork_ids.append(entry[1])

    state.current_image_ids = artwork_ids

    prompt = f"Tour {round_num + 1}/{ScoringWeights.TOTAL_ROUNDS} : Sélectionnez l'image qui vous attire le plus"
    return paths, artwork_ids, prompt, state
1218
+
1219
+
1220
def select_image(choice: Optional[int], state: SessionState):
    """Record the image chosen in the current round and prepare the next step.

    Args:
        choice: Index of the chosen image among the displayed ones, or
            ``None`` when nothing was selected.
        state: Session object; on success the chosen ID is appended to
            ``selected_images`` and ``current_round`` is incremented.

    Returns:
        A 6-tuple ``(img1, img2, img3, image_choice, message, state)`` when
        the choice is missing or does not map to a displayed image.
        Otherwise an 8-tuple with two extra visibility updates
        ``(selection_section, loading_section)``: the next round's images
        while rounds remain, or the loading section once
        ``ScoringWeights.TOTAL_ROUNDS`` is reached.
    """

    def unchanged(message):
        # Leave every widget as it is and only report the problem.
        return (
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(),
            message,
            state,
        )

    if choice is None:
        return unchanged("Veuillez sélectionner une image")

    if not (state.current_image_ids and len(state.current_image_ids) > choice):
        return unchanged("Erreur: image non trouvée")

    selected_id = state.current_image_ids[choice]
    state.selected_images.append(selected_id)
    state.current_round += 1

    logger.info(
        f"Tour {state.current_round}: Image {choice+1} sélectionnée (ID: {selected_id})"
    )

    if state.current_round >= ScoringWeights.TOTAL_ROUNDS:
        # Every round is done: hide the selection and show the loading panel.
        return (
            gr.update(),  # img1
            gr.update(),  # img2
            gr.update(),  # img3
            gr.update(),  # image_choice
            "",  # empty status message
            state,
            gr.update(visible=False),  # selection_section
            gr.update(visible=True),  # loading_section
        )

    next_images, _, message, state = load_images_for_round(
        state.current_round, state
    )
    return (
        gr.update(value=next_images[0]),
        gr.update(value=next_images[1]),
        gr.update(value=next_images[2]),
        gr.update(value=None),  # clear the radio for the new round
        message,
        state,
        gr.update(visible=True),  # selection_section stays visible
        gr.update(visible=False),  # loading_section stays hidden
    )
1277
+
1278
+
1279
def show_results(state: SessionState):
    """Compute the final recommendation and build the results view.

    Args:
        state: Session object holding the visitor's details and selections;
            ``final_artwork`` and ``recommendation_type`` are written to it
            before the session is logged.

    Returns:
        A 7-tuple ``(info_section, selection_section, loading_section,
        results_section, image, title, explanation)``. While the session is
        not complete, the selection section is kept visible and the last
        three values are ``None``, ``""`` and ``""``.
    """
    if not state.is_complete():
        return (
            gr.update(visible=False),  # info_section
            gr.update(visible=True),  # selection_section
            gr.update(visible=False),  # loading_section
            gr.update(visible=False),  # results_section
            None,
            "",
            "",
        )

    match_image, reason, info = matcher.find_best_match(
        state.firstname,
        state.birthday,
        state.city,
        state.selected_images,
    )

    if not match_image:
        # Nothing to show, but the session is still logged.
        state.final_artwork = "Aucune œuvre trouvée"
        state.recommendation_type = "none"
        session_tracker.log_session(state, "none")

        title = "Œuvre non trouvée"
        explanation = "Désolé, aucune œuvre correspondante n'a pu être trouvée."
    else:
        # The reason text tells which recommendation path produced the match:
        # an exact name/date/place hit, or the emotion-based ranking.
        exact_hit = "correspond exactement" in reason.lower()
        recommendation_type = "name_date_place" if exact_hit else "emotions"

        state.final_artwork = info.get("title", "Œuvre inconnue")
        state.recommendation_type = recommendation_type
        session_tracker.log_session(state, recommendation_type)

        explanation = f"""
**Votre œuvre correspondante a été trouvée !**

**Raison du match :** {reason}

**Détails de l'œuvre :**
- Type : {info.get('type', 'Non spécifié')}
- Lieu : {info.get('place', 'Non spécifié')}
- Émotions : {', '.join(info.get('emotions', [])) if info.get('emotions') else 'Non spécifiées'}

**Description :**
{info.get('explanation', 'Aucune description disponible')}
"""
        title = info.get("title", "Œuvre non trouvée")

    return (
        gr.update(visible=False),  # info_section
        gr.update(visible=False),  # selection_section
        gr.update(visible=False),  # loading_section
        gr.update(visible=True),  # results_section
        match_image,
        title,
        explanation,
    )
1345
+
1346
+
1347
# Gradio interface: four sections (info form, image selection, loading,
# results) whose visibility is toggled by the callbacks defined below.
with gr.Blocks(
    title="Art Matcher",
    theme=gr.themes.Soft(
        primary_hue="teal",
        secondary_hue="teal",
        neutral_hue="zinc",
    ),
) as demo:
    gr.Markdown(
        """
# 🎨 Art Matcher
### Découvrez l'œuvre d'art qui vous correspond !

Cette application utilise vos informations personnelles et vos préférences visuelles
pour trouver l'œuvre d'art qui vous correspond le mieux dans notre collection.
"""
    )

    # Session data carried between callbacks.
    session_state = gr.State(SessionState())

    # --- Step 1: personal information form -------------------------------
    with gr.Group(visible=True) as info_section:
        gr.Markdown("### Étape 1 : Vos informations")
        with gr.Row():
            firstname_input = gr.Textbox(
                label="Prénom",
                placeholder="Entrez votre prénom",
                max_lines=1,
            )
            birthday_input = gr.Textbox(
                label="Date d'anniversaire (JJ/MM)",
                placeholder="Ex: 25/12",
                max_lines=1,
            )
            city_input = gr.Textbox(
                label="Ville de résidence",
                placeholder="Ex: Paris",
                max_lines=1,
            )

        submit_info_btn = gr.Button("Valider mes informations", variant="primary")

    # --- Step 2: image selection rounds ----------------------------------
    with gr.Group(visible=False) as selection_section:
        selection_title = gr.Markdown("### Étape 2 : Sélection d'images")

        with gr.Row():
            img1 = gr.Image(label="Image 1", type="filepath", height=300)
            img2 = gr.Image(label="Image 2", type="filepath", height=300)
            img3 = gr.Image(label="Image 3", type="filepath", height=300)

        # type="index" makes the callback receive the position of the choice.
        image_choice = gr.Radio(
            choices=["Image 1", "Image 2", "Image 3"],
            label="Quelle image vous attire le plus ?",
            type="index",
        )

        select_btn = gr.Button("Valider mon choix", variant="primary")

    # --- Loading panel shown while the final match is computed -----------
    with gr.Group(visible=False) as loading_section:
        gr.Markdown("### ⏳ Analyse en cours...")
        gr.HTML("""
<div style="text-align: center; padding: 40px;">
<div style="display: inline-block; width: 60px; height: 60px; border: 6px solid #f3f3f3; border-top: 6px solid #14b8a6; border-radius: 50%; animation: spin 1s linear infinite;"></div>
<style>
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
</style>
<p style="margin-top: 20px; font-size: 18px; color: #666;">
<strong>Traitement de vos sélections émotionnelles...</strong><br>
<span style="font-size: 14px;">Nous analysons votre profil pour trouver l'œuvre parfaite</span>
</p>
</div>
""")

    # --- Results ----------------------------------------------------------
    with gr.Group(visible=False) as results_section:
        gr.Markdown("### Votre œuvre correspondante")

        with gr.Row():
            with gr.Column(scale=1):
                result_image = gr.Image(label="Votre œuvre", height=400)
                result_title = gr.Markdown("## Titre de l'œuvre")

            with gr.Column(scale=1):
                result_explanation = gr.Markdown("")

        restart_btn = gr.Button("Recommencer", variant="secondary")

    status_message = gr.Markdown("")

    def on_info_submit(firstname, birthday, city, state):
        """Validate the form and, when accepted, load the first round."""
        state.reset()

        info_vis, select_vis, results_vis, message, state = process_user_info(
            firstname, birthday, city, state
        )

        if not select_vis["visible"]:
            # Form refused: keep the images empty and show the error message.
            return (info_vis, select_vis, results_vis, None, None, None, message, state)

        images, _, round_message, state = load_images_for_round(0, state)
        # Pad to three slots in case fewer images came back.
        slots = [images[i] if len(images) > i else None for i in range(3)]
        return (
            info_vis,
            select_vis,
            results_vis,
            slots[0],
            slots[1],
            slots[2],
            round_message,
            state,
        )

    submit_info_btn.click(
        fn=on_info_submit,
        inputs=[firstname_input, birthday_input, city_input, session_state],
        outputs=[
            info_section,
            selection_section,
            results_section,
            img1,
            img2,
            img3,
            status_message,
            session_state,
        ],
    )

    def on_image_select(choice, state):
        """Forward the choice to ``select_image`` and map it onto the widgets."""
        outcome = select_image(choice, state)

        if len(outcome) == 8:
            # Successful selection: visibility updates are included.
            (first, second, third, radio, message, state,
             selection_vis, loading_vis) = outcome
        else:
            # 6-value form (missing or invalid choice): sections unchanged.
            (first, second, third, radio, message, state) = outcome
            selection_vis = gr.update()
            loading_vis = gr.update()

        return (
            gr.update(),  # info_section
            selection_vis,  # selection_section
            loading_vis,  # loading_section
            gr.update(),  # results_section
            first,  # img1
            second,  # img2
            third,  # img3
            radio,  # image_choice
            message,  # status_message
            state,
        )

    def handle_final_results(state):
        """Show the results once every round is done; otherwise change nothing."""
        if not state.is_complete():
            return gr.update(), gr.update(), gr.update(), gr.update(), None, "", ""
        return show_results(state)

    # The second step runs after every selection and only switches to the
    # results view when the session is complete.
    select_btn.click(
        fn=on_image_select,
        inputs=[image_choice, session_state],
        outputs=[
            info_section,
            selection_section,
            loading_section,
            results_section,
            img1,
            img2,
            img3,
            image_choice,
            status_message,
            session_state,
        ],
    ).then(
        fn=handle_final_results,
        inputs=[session_state],
        outputs=[
            info_section,
            selection_section,
            loading_section,
            results_section,
            result_image,
            result_title,
            result_explanation,
        ],
    )

    def restart_app(state):
        """Reset the session and bring the interface back to the info form."""
        state.reset()

        notice = "Application réinitialisée. Veuillez entrer vos informations."
        return (
            gr.update(visible=True),  # info_section
            gr.update(visible=False),  # selection_section
            gr.update(visible=False),  # loading_section
            gr.update(visible=False),  # results_section
            "",  # firstname_input
            "",  # birthday_input
            "",  # city_input
            None,  # image_choice
            notice,  # status_message
            state,
        )

    restart_btn.click(
        fn=restart_app,
        inputs=[session_state],
        outputs=[
            info_section,
            selection_section,
            loading_section,
            results_section,
            firstname_input,
            birthday_input,
            city_input,
            image_choice,
            status_message,
            session_state,
        ],
    )


if __name__ == "__main__":
    demo.launch()