Aurel-test committed on
Commit
3ecc91d
·
verified ·
1 Parent(s): d9459c4

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +1566 -0
app.py ADDED
@@ -0,0 +1,1566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Web Demo v2 pour la base de données d'œuvres d'art - Version Sécurisée et Optimisée
4
+ Interface multi-étapes avec matching basé sur prénom, date, ville et émotions
5
+ Optimisé pour les performances avec caching et indexation
6
+ Version sécurisée avec validation des entrées et gestion d'état propre
7
+ """
8
+
9
+ import gradio as gr
10
+ import os
11
+ import sys
12
+ import logging
13
+ from logging.handlers import RotatingFileHandler
14
+ import random
15
+ import re
16
+ import json
17
+ import uuid
18
+ import time
19
+ from datetime import datetime
20
+ from typing import List, Dict, Tuple, Optional, Any, Set
21
+ from collections import Counter, defaultdict
22
+ from functools import lru_cache
23
+ from dataclasses import dataclass, field, asdict
24
+ from pathlib import Path
25
+ import pandas as pd
26
+
27
# Main application logging configuration
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Optional dependency used for persistent storage on HF Spaces
try:
    from huggingface_hub import CommitScheduler

    HF_HUB_AVAILABLE = True
except ImportError:
    HF_HUB_AVAILABLE = False
    logger.warning(
        "huggingface_hub non installé - Les logs ne seront pas sauvegardés dans un dataset HF"
    )

# Session logging configuration
SESSION_LOG_FILE = "session_logs.jsonl"
STATS_LOG_FILE = "statistics.json"

# HF dataset used for persistence (override through environment variables)
HF_DATASET_ID = os.environ.get(
    "HF_DATASET_ID", "ClickMons/art-matcher-logs"
)  # Replace with your own dataset
HF_TOKEN = os.environ.get("HF_TOKEN", None)  # HF token used for authentication
LOGS_UPLOAD_INTERVAL = 10  # Upload every 10 minutes

# Local rotating file handler for the session log.
# exist_ok=True avoids the check-then-create race when several workers
# start at the same time.
os.makedirs("logs", exist_ok=True)

session_file_handler = RotatingFileHandler(
    filename=os.path.join("logs", SESSION_LOG_FILE),
    maxBytes=10 * 1024 * 1024,  # 10MB
    backupCount=5,
    encoding="utf-8",
)
session_file_handler.setLevel(logging.INFO)
session_logger = logging.getLogger("session_logger")
session_logger.addHandler(session_file_handler)
session_logger.setLevel(logging.INFO)
71
+
72
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
73
+
74
+ from art_pieces_db.database import Database
75
+ from art_pieces_db.query import TargetProfile, WeightedLeximaxOptimizer, Optimizer
76
+ from art_pieces_db.emotions import EmotionWheel
77
+ from art_pieces_db.utils import str_to_date
78
+
79
+
80
@dataclass
class ScoringWeights:
    """Single home for the scoring constants so no magic numbers leak into the code."""

    # Relative weight of each criterion during preselection
    PRESELECTION_NAME_WEIGHT: float = 3.0
    PRESELECTION_DATE_WEIGHT: float = 1.0
    PRESELECTION_PLACE_WEIGHT: float = 2.0
    PRESELECTION_EMOTION_WEIGHT: float = 0.0

    # Sizing of the preselection and of the interactive rounds
    MIN_PRESELECTION_COUNT: int = 20
    MAX_IMAGES_PER_SELECTION: int = 3  # images shown per selection step
    TOTAL_ROUNDS: int = 3  # rounds played before the final recommendation
92
+
93
+
94
@dataclass
class SessionState:
    """Mutable state carried through one user session."""

    # Answers typed by the user
    firstname: str = ""
    birthday: str = ""
    city: str = ""

    # Progress through the image-selection rounds
    current_round: int = 0
    selected_images: List[str] = field(default_factory=list)
    current_image_ids: List[str] = field(default_factory=list)

    preselected_pieces: Optional[pd.DataFrame] = None

    # Tracking fields
    session_start_time: float = field(default_factory=time.time)
    recommendation_type: str = ""  # "name_date_place" or "emotions"
    final_artwork: str = ""

    def reset(self):
        """Put every field back to its initial value and restart the session clock."""
        self.firstname = self.birthday = self.city = ""
        self.current_round = 0
        # Two distinct fresh lists, never a shared one
        self.selected_images, self.current_image_ids = [], []
        self.preselected_pieces = None
        self.session_start_time = time.time()
        self.recommendation_type = self.final_artwork = ""

    def is_complete(self) -> bool:
        """Return True once the configured number of rounds has been played."""
        rounds_required = ScoringWeights.TOTAL_ROUNDS
        return rounds_required <= self.current_round
129
+
130
+
131
class SessionLogger:
    """Record finished sessions and keep running statistics on disk.

    Each instance appends one JSON line per session to its own
    ``sessions_<uuid>.jsonl`` file under ``art_matcher_data`` and maintains
    ``global_statistics.json`` in the same folder. When ``SPACE_ID`` is set
    and ``huggingface_hub`` is importable, a ``CommitScheduler`` is attached
    to that folder, targeting the ``HF_DATASET_ID`` dataset repository.
    """

    def __init__(self):
        import threading  # local import: only this class needs it

        # Detect whether we are running inside a HF Space
        self.is_hf_space = os.environ.get("SPACE_ID") is not None

        # Data directory shared by the session file and the statistics file
        self.data_dir = Path("art_matcher_data")
        self.data_dir.mkdir(parents=True, exist_ok=True)

        # One session file per process instance, named after a random UUID
        self.instance_id = uuid.uuid4()
        self.sessions_file = self.data_dir / f"sessions_{self.instance_id}.jsonl"
        self.stats_file = self.data_dir / "global_statistics.json"

        # Fallback lock used when no scheduler (and thus no scheduler lock)
        # exists, so the read-modify-write of the statistics file is still
        # serialized between concurrent requests.
        self._lock = threading.Lock()

        # Create the CommitScheduler only on HF Spaces
        self.scheduler = None
        if self.is_hf_space and HF_HUB_AVAILABLE:
            try:
                self.scheduler = CommitScheduler(
                    repo_id=HF_DATASET_ID,
                    repo_type="dataset",
                    folder_path=self.data_dir,
                    path_in_repo="data",
                    every=LOGS_UPLOAD_INTERVAL,
                )
                logger.info(
                    f"✅ CommitScheduler initialisé - Instance ID: {self.instance_id}"
                )
                logger.info(
                    f"Les données seront automatiquement sauvegardées dans {HF_DATASET_ID}"
                )
            except Exception as e:
                # Best effort: any scheduler failure falls back to local-only storage
                logger.warning(f"⚠️ CommitScheduler non disponible: {e}")
                logger.info("Les données seront stockées localement uniquement")

    def log_session(self, state: "SessionState", recommendation_system: str):
        """Append one session record and refresh the statistics under a lock.

        Args:
            state: session state; its start time and final artwork are read.
            recommendation_system: label stored as the recommendation type.
        """
        session_duration = time.time() - state.session_start_time

        entry = {
            # NOTE(review): this is the logger instance id, so every session
            # written by the same process shares it - confirm this is intended.
            "session_id": str(self.instance_id),
            "datetime": datetime.now().isoformat(),
            "duration_seconds": round(session_duration, 2),
            "recommended_artwork": state.final_artwork,
            "recommendation_type": recommendation_system,
        }

        # Use the scheduler lock when there is one (it guards the uploaded
        # folder); otherwise fall back to our own lock.
        lock = self.scheduler.lock if self.scheduler else self._lock
        with lock:
            self._write_session(entry)
            self._update_stats(entry)

        logger.info(f"Session enregistrée - Durée: {entry['duration_seconds']}s")
        session_logger.info(json.dumps(entry, ensure_ascii=False))

    def _write_session(self, entry: dict):
        """Append ``entry`` as one JSON line to this instance's session file."""
        with self.sessions_file.open("a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    def _update_stats(self, session_entry: dict):
        """Fold ``session_entry`` into the statistics file and rewrite it."""
        # Load existing statistics; an unreadable or corrupt file restarts from scratch
        stats = {}
        if self.stats_file.exists():
            try:
                with self.stats_file.open("r", encoding="utf-8") as f:
                    stats = json.load(f)
            except (json.JSONDecodeError, OSError):
                stats = {}

        # Initialize the structure when missing or not a JSON object
        if not isinstance(stats, dict) or "total_sessions" not in stats:
            stats = {
                "total_sessions": 0,
                "total_duration_seconds": 0,
                "average_duration_seconds": 0,
                "artworks_recommended": {},
                "recommendation_types": {
                    "name_date_place": 0,
                    "emotions": 0,
                    "none": 0,
                },
                "first_session": session_entry["datetime"],
                "last_session": session_entry["datetime"],
            }

        # Tolerate partial files that have the counter but lack sub-structures
        stats.setdefault("total_duration_seconds", 0)
        stats.setdefault("artworks_recommended", {})
        stats.setdefault(
            "recommendation_types", {"name_date_place": 0, "emotions": 0, "none": 0}
        )

        # Update the counters
        stats["total_sessions"] += 1
        stats["total_duration_seconds"] += session_entry.get("duration_seconds", 0)
        stats["average_duration_seconds"] = (
            stats["total_duration_seconds"] / stats["total_sessions"]
        )
        stats["last_session"] = session_entry["datetime"]

        # Count recommendation types; an empty or missing type counts as "none"
        # so the per-type counters keep adding up to the session total.
        rec_type = session_entry.get("recommendation_type") or "none"
        if rec_type in stats["recommendation_types"]:
            stats["recommendation_types"][rec_type] += 1

        # Count recommended artworks, ignoring the "nothing found" placeholder
        artwork = session_entry.get("recommended_artwork")
        if artwork and artwork != "Aucune œuvre trouvée":
            if artwork not in stats["artworks_recommended"]:
                stats["artworks_recommended"][artwork] = 0
            stats["artworks_recommended"][artwork] += 1

        # Most frequently recommended artwork
        if stats["artworks_recommended"]:
            most_popular = max(
                stats["artworks_recommended"].items(), key=lambda x: x[1]
            )
            stats["most_popular_artwork"] = {
                "title": most_popular[0],
                "count": most_popular[1],
                "percentage": (most_popular[1] / stats["total_sessions"]) * 100,
            }

        # Share of each recommendation type
        total = stats["total_sessions"]
        if total > 0:
            stats["recommendation_percentages"] = {
                k: (v / total) * 100 for k, v in stats["recommendation_types"].items()
            }

        stats["last_updated"] = datetime.now().isoformat()

        # Persist the updated statistics
        with self.stats_file.open("w", encoding="utf-8") as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)

    def get_statistics(self) -> dict:
        """Return the stored statistics, or ``{}`` when absent or unreadable."""
        if self.stats_file.exists():
            try:
                with self.stats_file.open("r", encoding="utf-8") as f:
                    return json.load(f)
            except (OSError, json.JSONDecodeError) as e:
                logger.error(f"Erreur lecture stats: {e}")
        return {}
279
+
280
+
281
+ # Initialiser le logger de sessions
282
+ session_tracker = SessionLogger()
283
+
284
+
285
class SecurityValidator:
    """Central place for input and filename validation helpers."""

    PATH_TRAVERSAL_PATTERN = re.compile(r"\.\.|\.\/")
    VALID_FILENAME_PATTERN = re.compile(r"^[\w\-\.\s]+$")
    VALID_INPUT_PATTERN = re.compile(
        r"^[\w\-\s\'\.,àâäéèêëïîôûùüÿæœçÀÂÄÉÈÊËÏÎÔÛÙÜŸÆŒÇ]+$", re.UNICODE
    )
    DATE_PATTERN = re.compile(r"^\d{1,2}/\d{1,2}$")

    @classmethod
    def validate_filename(cls, filename: str) -> bool:
        """Return True when ``filename`` is a bare, safe file name.

        Rejects empty names, ``..`` or ``./`` sequences, any path separator
        (so absolute or nested paths never pass), and names containing
        characters outside ``VALID_FILENAME_PATTERN``.
        """
        if not filename:
            return False

        # Reject path traversal attempts
        if cls.PATH_TRAVERSAL_PATTERN.search(filename):
            logger.warning(f"Tentative de path traversal détectée: {filename}")
            return False

        # A file *name* must not carry directory components; checking only the
        # basename would let "/etc/passwd" through.
        if "/" in filename or "\\" in filename:
            logger.warning(f"Nom de fichier invalide: {filename}")
            return False

        # Only allowed characters, over the whole string
        if not cls.VALID_FILENAME_PATTERN.fullmatch(filename):
            logger.warning(f"Nom de fichier invalide: {filename}")
            return False

        return True

    @classmethod
    def sanitize_input(cls, input_str: str, max_length: int = 100) -> str:
        """Truncate, trim and strip disallowed characters from user input.

        Args:
            input_str: raw user text; falsy values yield ``""``.
            max_length: maximum number of characters kept before trimming.

        Returns:
            The input unchanged when it is valid, otherwise the input with
            every disallowed character removed and surrounding whitespace
            trimmed.
        """
        if not input_str:
            return ""

        # Truncate first so the rest works on bounded input
        input_str = input_str[:max_length].strip()

        if not cls.VALID_INPUT_PATTERN.fullmatch(input_str):
            # Keep only the characters that are individually valid; trim again
            # because removing characters can expose leading/trailing spaces.
            cleaned = "".join(
                c for c in input_str if cls.VALID_INPUT_PATTERN.fullmatch(c)
            ).strip()
            logger.info(f"Input sanitized: '{input_str}' -> '{cleaned}'")
            return cleaned

        return input_str

    @classmethod
    def validate_date(cls, date_str: str) -> Tuple[bool, Optional[datetime]]:
        """Validate and parse a ``DD/MM`` date.

        Returns:
            ``(True, datetime)`` with the year fixed to 2000 (a leap year, so
            29/02 is accepted), or ``(False, None)`` when the text is empty,
            malformed or not a real calendar day.
        """
        if not date_str:
            return False, None

        # fullmatch: "$" alone would tolerate a trailing newline
        if not cls.DATE_PATTERN.fullmatch(date_str):
            return False, None

        day, month = map(int, date_str.split("/"))
        if not (1 <= day <= 31 and 1 <= month <= 12):
            return False, None

        try:
            date_obj = datetime(year=2000, month=month, day=day)
        except ValueError as e:
            # e.g. 31/02: in range per field but not a real date
            logger.error(f"Erreur de parsing de date: {e}")
            return False, None
        return True, date_obj
350
+
351
+
352
class ImageIndexer:
    """Index the image files of a directory and resolve database image names to files.

    File names and database names are reduced to a normalized key (lowercase,
    dashes instead of spaces/underscores, known suffixes removed) so that the
    loosely formatted names found in the CSV can be matched to real files.
    """

    IMAGE_EXTENSIONS = (".jpg", ".png")
    COMMON_SUFFIXES = [".jpg", ".png", "_medium"]
    MAR_BVM_TEST_SUFFIXES = ["-001", "-002", "-003"]

    def __init__(self, images_dir: str):
        """Build the index for ``images_dir`` immediately."""
        self.images_dir = os.path.abspath(images_dir)
        self.available_files = set()
        self.image_lookup = {}  # normalized_name -> filename
        self.mar_bvm_lookup = {}  # MAR-BVM base key -> [filenames]
        self._build_index()

    def _strip_file_extensions(self, filename: str) -> str:
        """Return the lowercased name without ``_medium.jpg`` / ``.jpg`` / ``.png``."""
        base_name = filename.lower()
        if base_name.endswith("_medium.jpg"):
            return base_name[:-11]
        elif base_name.endswith((".jpg", ".png")):
            return base_name[:-4]
        return base_name

    def _normalize_basic_patterns(self, name: str) -> str:
        """Lowercase, trim, drop known suffixes and turn spaces/underscores into dashes."""
        # Remove trailing comma and surrounding whitespace
        normalized = name.lower().strip().rstrip(",")

        # Remove common suffixes (checked once each, in declaration order)
        for suffix in self.COMMON_SUFFIXES:
            if normalized.endswith(suffix):
                normalized = normalized[: -len(suffix)]

        return re.sub(r"[\s_]+", "-", normalized)

    def _normalize_mar_bvm_format(self, name: str) -> str:
        """Replace dots by dashes in names containing ``mar-bvm``; others are unchanged."""
        if "mar-bvm" not in name:
            return name

        # Replace .0. with -0- and remaining dots with dashes
        return name.replace(".0.", "-0-").replace(".", "-")

    def _normalize_name(self, name: str) -> str:
        """Return the lookup key for ``name``."""
        normalized = self._normalize_basic_patterns(name)

        if "mar-bvm" in normalized:
            normalized = self._normalize_mar_bvm_format(normalized)
        # Names starting with "20" (e.g. 2022.0.86) keep their dots
        elif not normalized.startswith("20"):
            normalized = normalized.replace(".", "-")

        return normalized

    def _create_mar_bvm_lookups(self, normalized: str, filename: str):
        """Register ``filename`` under the first six dash-separated parts of a MAR-BVM key."""
        if "mar-bvm" not in normalized:
            return

        parts = normalized.split("-")
        # Only names with a numeric part at index 5 or later get a base key
        # (e.g. mar-bvm-7-2022-0-22-001 -> mar-bvm-7-2022-0-22)
        if any(part.isdigit() for part in parts[5:]):
            base_key = "-".join(parts[:6])
            self.mar_bvm_lookup.setdefault(base_key, []).append(filename)

    def _process_image_file(self, filename: str):
        """Add one directory entry to the index when it is a safe image file name."""
        if not SecurityValidator.validate_filename(filename):
            logger.warning(f"Fichier ignoré pour raison de sécurité: {filename}")
            return

        if not filename.lower().endswith(self.IMAGE_EXTENSIONS):
            return

        self.available_files.add(filename)

        base_name = self._strip_file_extensions(filename)
        normalized = self._normalize_name(base_name)
        self.image_lookup[normalized] = filename
        self._create_mar_bvm_lookups(normalized, filename)

    def _build_index(self):
        """Scan ``images_dir`` and fill the lookup tables.

        The listing is sorted so that key collisions and partial matches
        resolve the same way on every run. If the directory cannot be listed
        the error is logged and the index is left empty.
        """
        try:
            all_files = sorted(os.listdir(self.images_dir))
        except OSError as e:
            logger.error(f"Erreur lors de la construction de l'index: {e}")
            # Reset every structure so they stay consistent with each other
            self.available_files = set()
            self.image_lookup = {}
            self.mar_bvm_lookup = {}
            return

        for filename in all_files:
            self._process_image_file(filename)

        logger.info(
            f"Index des images construit: {len(self.available_files)} fichiers disponibles, "
            f"{len(self.image_lookup)} entrées normalisées"
        )

    def _clean_input_name(self, image_name: str) -> str:
        """Trim whitespace, trailing commas/dashes and spaces before ``-<digit>``."""
        cleaned = image_name.strip().rstrip(",").rstrip("-").strip()
        # Remove spaces before -001, -002, etc.
        return re.sub(r"\s+(-\d)", r"\1", cleaned)

    def _normalize_mar_bvm_input(self, image_name: str) -> str:
        """Rewrite the MAR-BVM name variants seen in the data into dashed form.

        Names that do not contain the uppercase marker ``MAR-BVM`` are
        returned unchanged.
        """
        if "MAR-BVM" not in image_name:
            return image_name

        # Handle missing "7-" in MAR-BVM-2022-0-153
        if "MAR-BVM-2022-0-" in image_name:
            image_name = image_name.replace("MAR-BVM-2022-0-", "MAR-BVM-7-2022-0-")

        # Convert .0. to -0-
        if ".0." in image_name:
            image_name = image_name.replace(".0.", "-0-")

        # Handle .001, .002 at the end (convert to -001, -002)
        image_name = re.sub(r"\.(\d{3})$", r"-\1", image_name)

        # Handle .1 or .2 suffix
        if image_name.endswith(".1"):
            image_name = image_name[:-2] + "-1"
        elif image_name.endswith(".2"):
            image_name = image_name[:-2] + "-2"

        # Any dot still present becomes a dash
        return image_name.replace(".", "-")

    def _try_mar_bvm_lookups(self, normalized: str) -> Optional[str]:
        """Resolve a MAR-BVM key through the base-key table, then the -001..-003 variants."""
        if normalized in self.mar_bvm_lookup and self.mar_bvm_lookup[normalized]:
            return self.mar_bvm_lookup[normalized][0]

        for suffix in self.MAR_BVM_TEST_SUFFIXES:
            test_pattern = f"{normalized}{suffix}"
            if test_pattern in self.image_lookup:
                return self.image_lookup[test_pattern]

        return None

    def _try_year_format_lookup(self, image_name: str) -> Optional[str]:
        """Look up names starting with ``20`` using only lowercase and space-to-dash."""
        if not image_name.startswith("20"):
            return None

        test_name = image_name.lower().replace(" ", "-")
        return self.image_lookup.get(test_name)

    def _try_partial_matching(self, normalized: str) -> Optional[str]:
        """Return the first indexed file whose key contains ``normalized``.

        An empty key never matches: ``"" in key`` is always true and would
        otherwise return an arbitrary image.
        """
        if not normalized:
            return None
        for key, filename in self.image_lookup.items():
            # A prefix match is also a substring match, so one test suffices
            if normalized in key:
                return filename
        return None

    def _split_multiple_names(self, image_name: str) -> List[str]:
        """Split a field holding several names separated by commas, slashes or `` - ``."""
        # First try comma separation
        if "," in image_name:
            return [name.strip() for name in image_name.split(",") if name.strip()]

        # Then try slash separation
        if "/" in image_name:
            return [name.strip() for name in image_name.split("/") if name.strip()]

        # Handle " - " separation (for cases like "MAR-BVM-7-2022.0.81 - 2022.0.81")
        if " - " in image_name and image_name.count(" - ") == 1:
            parts = [name.strip() for name in image_name.split(" - ")]
            if len(parts) == 2:
                first, second = parts
                # Second part repeating a piece of the first: treat as duplicate
                if first.endswith(second) or second in first:
                    return [first]
                return parts

        return [image_name]

    def find_image(self, image_name: str) -> Optional[str]:
        """Return the indexed file name matching ``image_name``, or None.

        The field may contain several candidate names; the first one that
        resolves wins.
        """
        if not image_name:
            return None

        for name in self._split_multiple_names(image_name):
            result = self._find_single_image(name)
            if result:
                return result

        return None

    def _find_single_image(self, image_name: str) -> Optional[str]:
        """Resolve one candidate name: exact key, MAR-BVM rules, year form, then substring."""
        cleaned_name = self._clean_input_name(image_name)
        processed_name = self._normalize_mar_bvm_input(cleaned_name)
        normalized = self._normalize_name(processed_name)

        # Nothing left after cleaning (e.g. "-" or " - "): no meaningful match
        if not normalized:
            return None

        # Try direct lookup first
        if normalized in self.image_lookup:
            return self.image_lookup[normalized]

        # Try MAR-BVM specific lookups
        if "mar-bvm" in normalized:
            result = self._try_mar_bvm_lookups(normalized)
            if result:
                return result

        # Try year format lookup
        result = self._try_year_format_lookup(image_name)
        if result:
            return result

        # Try partial matching as last resort
        return self._try_partial_matching(normalized)

    def get_all_files(self) -> Set[str]:
        """Return a copy of the set of indexed file names."""
        return self.available_files.copy()
583
+
584
+
585
+ class ArtMatcherV2:
586
+ """Classe principale pour le matching d'œuvres d'art"""
587
+
588
def __init__(self, csv_path: str, images_dir: str):
    """Load the database, index the image directory and precompute the lookups."""
    self.db = Database(csv_path)
    self.images_dir = os.path.abspath(images_dir)
    self.emotion_wheel = EmotionWheel()
    self.weights = ScoringWeights()

    self.optimizer_helper = WeightedLeximaxOptimizer(TargetProfile(), {})

    self.image_indexer = ImageIndexer(images_dir)

    # Keep only the rows whose image name is present and not blank
    full_df = self.db.get_dataframe()
    image_names = full_df["name_image"]
    has_image = (
        image_names.notna() & (image_names != "") & (image_names.str.strip() != "")
    )
    self.df_with_images = full_df[has_image].copy()

    # String form of the id, plus a map from that string to the row label
    id_column = self.df_with_images["database_id"]
    self.df_with_images["database_id_str"] = id_column.astype(str)
    self.id_to_index = {
        str(record["database_id"]): label
        for label, record in self.df_with_images.iterrows()
    }

    self.artwork_images = self._build_artwork_image_index()

    # Database wrapper around the filtered frame, created without running
    # Database.__init__
    images_only_db = Database.__new__(Database)
    images_only_db.dataframe = self.df_with_images
    self.temp_db_with_images = images_only_db

    logger.info(f"Base de données chargée: {self.db.n_pieces()} œuvres")
    logger.info(f"Œuvres avec images: {len(self.df_with_images)}")
    logger.info(f"Index des images: {len(self.artwork_images)} œuvres mappées")
621
+
622
def _sanitize_input(self, input_str: str) -> str:
    """Return ``input_str`` cleaned by ``SecurityValidator.sanitize_input``."""
    cleaned = SecurityValidator.sanitize_input(input_str)
    return cleaned
625
+
626
def _parse_date(self, date_str: str) -> Optional[datetime]:
    """Return the parsed date, or None when ``SecurityValidator`` rejects it."""
    accepted, parsed = SecurityValidator.validate_date(date_str)
    if not accepted:
        return None
    return parsed
630
+
631
def _build_artwork_image_index(self) -> Dict[str, List[str]]:
    """Map each artwork id (as a string) to the paths of its resolved images.

    Artworks for which no image file could be resolved are left out.
    """
    index = {}

    for _, record in self.df_with_images.iterrows():
        artwork_id = str(record["database_id"])
        raw_value = record["name_image"]
        if not (raw_value and str(raw_value).strip()):
            continue

        names_field = str(raw_value).strip().strip('"')

        # " / " separator: keep only what comes before it
        if " / " in names_field:
            names_field = names_field.split(" / ")[0].strip()

        if " - 2022" in names_field:
            # " - 2022" is a separator here, not part of the name
            names_field = names_field.split(" - 2022")[0].strip()
        elif " - " in names_field and "MAR-BVM-7-2022-0-" not in names_field:
            # Other MAR-BVM spellings using " - " as separator
            head = names_field.split(" - ")[0]
            if "MAR-BVM" in head:
                names_field = head.strip()

        # Drop a trailing " -", then spaces placed before "-<digit>"
        names_field = re.sub(r"\s+-\s*$", "", names_field)
        names_field = re.sub(r"\s+(-\d)", r"\1", names_field)

        # Split the remaining comma/slash separated list and resolve each name
        candidates = (part.strip() for part in re.split(r"[,/]", names_field))
        matches = (
            self.image_indexer.find_image(candidate)
            for candidate in candidates
            if candidate
        )
        paths = [os.path.join(self.images_dir, match) for match in matches if match]

        if paths:
            index[artwork_id] = paths

    return index
684
+
685
def preselect_artworks(
    self, firstname: str, birthday: str, city: str
) -> pd.DataFrame:
    """Rank the artworks that have images against first name, birthday and city.

    Rows whose score is greater than ``(0, 0, 0)`` are kept; when fewer than
    ``MIN_PRESELECTION_COUNT`` remain, the head of the optimizer result is
    returned instead. The top five rows are logged with per-criterion scores.
    """
    logger.info("=== DÉBUT PRÉ-SÉLECTION ===")

    # Clean the free-text inputs
    firstname, city = self._sanitize_input(firstname), self._sanitize_input(city)

    logger.info(
        f"Critères de pré-sélection: prénom='{firstname}', date='{birthday}', ville='{city}'"
    )

    birth_date = self._parse_date(birthday)
    if birth_date:
        logger.info(f"Date convertie: {birth_date.strftime('%d/%m')}")

    profile = TargetProfile()
    profile.set_target_name(firstname)
    profile.set_target_date(birth_date)
    profile.set_target_place(city)

    cfg = self.weights
    weights = dict(
        related_names=cfg.PRESELECTION_NAME_WEIGHT,
        related_dates=cfg.PRESELECTION_DATE_WEIGHT,
        related_places=cfg.PRESELECTION_PLACE_WEIGHT,
        related_emotions=cfg.PRESELECTION_EMOTION_WEIGHT,
    )

    logger.info(
        f"Poids utilisés: nom={weights['related_names']}, date={weights['related_dates']}, lieu={weights['related_places']}, émotions={weights['related_emotions']}"
    )

    ranked = WeightedLeximaxOptimizer(profile, weights).optimize_max(
        self.temp_db_with_images
    )

    preselected = ranked[ranked["score"] > (0, 0, 0)]
    logger.info(f"Œuvres avec score > 0: {len(preselected)}")

    # Too few scored rows: fall back to the head of the optimizer result
    if len(preselected) < self.weights.MIN_PRESELECTION_COUNT:
        preselected = ranked.head(self.weights.MIN_PRESELECTION_COUNT)
        logger.info(f"Ajustement au minimum requis: {len(preselected)} œuvres")

    logger.info("Top 5 pré-sélections:")
    for i, (_, piece) in enumerate(preselected.head(5).iterrows()):
        logger.info(
            f" {i+1}. Œuvre #{piece['database_id']} - Score: {piece['score']}"
        )

        names = piece["related_names"]
        if firstname and names:
            name_score = Optimizer.name_similarity(firstname, names)
            if name_score > 0:
                logger.info(
                    f" → Nom: {piece['related_names']} (score: {name_score:.2f})"
                )

        dates = piece["related_dates"]
        if birth_date and dates:
            date_score = Optimizer.date_similarity(birth_date, dates)
            if date_score > 0:
                logger.info(
                    f" → Dates: {[d.strftime('%d/%m') for d in piece['related_dates']]} (score: {date_score:.2f})"
                )

        places = piece["related_places"]
        if city and places:
            place_score = self.optimizer_helper.place_similarity(city, places)
            if place_score > 0:
                logger.info(
                    f" → Lieux: {piece['related_places']} (score: {place_score:.2f})"
                )

    logger.info("=== FIN PRÉ-SÉLECTION ===")
    return preselected
763
+
764
def get_random_images_for_selection(
    self, round_num: int, already_selected: Optional[List[str]] = None
) -> List[Tuple[str, str]]:
    """Pick random artworks, one image each, for a selection round.

    Args:
        round_num: round number, used only in log messages.
        already_selected: artwork ids chosen in earlier rounds, excluded
            from the draw.

    Returns:
        A list of ``(image_path, artwork_id)`` tuples holding
        ``MAX_IMAGES_PER_SELECTION`` entries. When fewer artworks than that
        are available, falls back to raw indexed image files with the
        placeholder id ``"0"``.
    """
    logger.info(f"=== SÉLECTION D'IMAGES POUR LE TOUR {round_num} ===")

    if already_selected:
        logger.info(f"Œuvres déjà sélectionnées à exclure: {already_selected}")

    available_artworks = list(self.artwork_images.keys())

    # Exclude artworks picked in previous rounds
    if already_selected:
        already_selected_set = set(already_selected)
        available_artworks = [
            a for a in available_artworks if a not in already_selected_set
        ]

    logger.info(
        f"Nombre total d'œuvres avec images disponibles: {len(available_artworks)}"
    )

    max_images = self.weights.MAX_IMAGES_PER_SELECTION

    if len(available_artworks) < max_images:
        logger.warning(
            f"Seulement {len(available_artworks)} œuvres avec images disponibles"
        )
        # Fallback: serve indexed files directly. Every supported extension is
        # accepted case-insensitively and the names are sorted, so the result
        # no longer depends on set iteration order or on files being
        # lowercase ".jpg".
        extensions = self.image_indexer.IMAGE_EXTENSIONS
        fallback_files = sorted(
            name
            for name in self.image_indexer.get_all_files()
            if name.lower().endswith(extensions)
        )
        return [
            (os.path.join(self.images_dir, name), "0")
            for name in fallback_files[:max_images]
        ]

    # At least max_images artworks remain at this point
    selected_artworks = random.sample(available_artworks, max_images)

    logger.info(f"Œuvres sélectionnées aléatoirement: {selected_artworks}")

    selected = []
    for artwork_id in selected_artworks:
        # One random image per chosen artwork
        img_path = random.choice(self.artwork_images[artwork_id])
        selected.append((img_path, artwork_id))
        if artwork_id in self.id_to_index:
            idx = self.id_to_index[artwork_id]
            artwork = self.df_with_images.loc[idx]
            logger.info(f" Image {len(selected)}: Œuvre #{artwork_id}")
            logger.info(f" Type: {artwork['art_piece_type']}")
            logger.info(f" Émotions: {artwork['related_emotions']}")

    logger.info(f"=== FIN SÉLECTION IMAGES TOUR {round_num} ===")
    return selected
820
+
821
def extract_emotions_from_image_id(self, database_id: str) -> List[str]:
    """
    Return the emotions stored for the artwork with the given id.

    The id is translated to a row label through ``self.id_to_index`` and
    the ``related_emotions`` cell of ``self.df_with_images`` is read.

    Returns:
        The stored list, or an empty list when the id is unknown or the
        cell does not hold a list.
    """
    if database_id not in self.id_to_index:
        return []

    row_label = self.id_to_index[database_id]
    stored = self.df_with_images.loc[row_label, "related_emotions"]
    return stored if isinstance(stored, list) else []
832
+
833
def _cached_emotion_similarity(self, emotion1: str, emotion2: str) -> float:
    """
    Return the similarity between two emotions, memoised per instance.

    Results are kept in a dictionary stored on the instance instead of a
    ``functools.lru_cache`` wrapped around the method: a method-level
    ``lru_cache`` includes ``self`` in its key, which keeps the instance
    alive for as long as the cache exists and requires it to be hashable.

    Args:
        emotion1: First emotion label.
        emotion2: Second emotion label.

    Returns:
        The value computed by
        ``self.emotion_wheel.calculate_emotion_similarity``.
    """
    cache = getattr(self, "_emotion_similarity_cache", None)
    if cache is None:
        cache = {}
        self._emotion_similarity_cache = cache

    key = (emotion1, emotion2)
    if key not in cache:
        if len(cache) >= 1024:
            # Keep the bound of the former lru_cache(maxsize=1024); a full
            # reset is enough for a small vocabulary of emotion labels.
            cache.clear()
        cache[key] = self.emotion_wheel.calculate_emotion_similarity(
            emotion1, emotion2
        )
    return cache[key]
837
+
838
def calculate_emotion_profile(self, selected_ids: List[str]) -> Dict[str, float]:
    """
    Build the visitor's emotion profile from the images they selected.

    Every emotion attached to a selected artwork counts once per
    occurrence; the counts are then divided by their sum.

    Args:
        selected_ids: Ids of the artworks chosen by the visitor.

    Returns:
        A mapping ``emotion -> share of all counted emotions``; empty
        when none of the selected artworks carries an emotion.
    """
    logger.info("=== CALCUL DU PROFIL ÉMOTIONNEL ===")
    logger.info(f"Images sélectionnées: {selected_ids}")

    tally = Counter()
    for picked_id in selected_ids:
        found = self.extract_emotions_from_image_id(picked_id)
        logger.info(f" Image {picked_id}: émotions = {found}")
        tally.update(found)

    grand_total = sum(tally.values())
    if grand_total > 0:
        emotion_profile = {name: hits / grand_total for name, hits in tally.items()}
        logger.info(f"Profil émotionnel calculé: {emotion_profile}")
    else:
        emotion_profile = {}
        logger.info("Aucune émotion trouvée dans les images sélectionnées")

    logger.info("=== FIN CALCUL PROFIL ÉMOTIONNEL ===")
    return emotion_profile
864
+
865
def _get_artwork_image(self, artwork) -> Optional[str]:
    """
    Return the first indexed image path of an artwork.

    Args:
        artwork: Mapping-like record exposing a ``database_id`` entry.

    Returns:
        The first path listed in ``self.artwork_images`` for the
        stringified id, or None when the id is not indexed.
    """
    key = str(artwork["database_id"])
    return self.artwork_images[key][0] if key in self.artwork_images else None
874
+
875
def find_best_match(
    self, firstname: str, birthday: str, city: str, selected_image_ids: List[str]
) -> Tuple[Optional[str], str, Dict]:
    """
    Find the artwork that best matches the visitor.

    Decision order:
    1. An exact match among the pre-selected artworks wins immediately:
       first-name similarity >= 0.95, or date / place similarity == 1.0.
    2. Otherwise, pre-selected artworks whose score is above ``(0, 0, 0)``
       are ranked by emotional similarity with the visitor's profile.
    3. When no such artwork exists, every row of ``self.df_with_images``
       is ranked by emotional similarity instead.
    4. Ties are broken with the object types of the selected images, then
       at random.

    Args:
        firstname: Visitor's first name (sanitised here).
        birthday: Visitor's birthday as typed; parsed with ``_parse_date``.
        city: Visitor's city (sanitised here).
        selected_image_ids: Ids of the artworks the visitor picked; they
            are excluded from the emotion-based ranking.

    Returns:
        ``(image_path, reason, info)``. ``image_path`` may be None when the
        artwork has no indexed image; ``(None, "Aucune correspondance
        trouvée", {})`` is returned when no candidate is left.
    """

    def _match_info(piece) -> Dict:
        # Single place where the result payload shown to the user is built.
        return {
            "title": f"Œuvre #{piece['database_id']}",
            "type": piece["art_piece_type"],
            "place": piece["art_piece_place"],
            "emotions": piece["related_emotions"],
            "explanation": piece["explanation"],
        }

    firstname = self._sanitize_input(firstname)
    city = self._sanitize_input(city)
    birth_date = self._parse_date(birthday)

    logger.info(f"Recherche de correspondance pour: {firstname}, {birthday}, {city}")

    preselected = self.preselect_artworks(firstname, birthday, city)

    logger.info("=== DÉTECTION DE MATCH EXACT ===")
    for _, piece in preselected.iterrows():
        if firstname and piece["related_names"]:
            name_score = Optimizer.name_similarity(firstname, piece["related_names"])
            if name_score >= 0.95:
                logger.info(
                    f"🎯 MATCH EXACT TROUVÉ: prénom '{firstname}' → œuvre #{piece['database_id']} (score: {name_score:.2f})"
                )
                logger.info(f" Noms dans l'œuvre: {piece['related_names']}")
                return (
                    self._get_artwork_image(piece),
                    f"Prénom '{firstname}' correspond exactement",
                    _match_info(piece),
                )

        if birth_date and piece["related_dates"]:
            date_score = Optimizer.date_similarity(birth_date, piece["related_dates"])
            if date_score == 1.0:
                logger.info(
                    f"🎯 MATCH EXACT TROUVÉ: date '{birthday}' → œuvre #{piece['database_id']}"
                )
                logger.info(
                    f" Dates dans l'œuvre: {[d.strftime('%d/%m/%Y') for d in piece['related_dates']]}"
                )
                return (
                    self._get_artwork_image(piece),
                    f"Date d'anniversaire {birthday} correspond exactement",
                    _match_info(piece),
                )

        if city and piece["related_places"]:
            place_score = self.optimizer_helper.place_similarity(
                city, piece["related_places"]
            )
            if place_score == 1.0:
                logger.info(
                    f"🎯 MATCH EXACT TROUVÉ: ville '{city}' → œuvre #{piece['database_id']}"
                )
                logger.info(f" Lieux dans l'œuvre: {piece['related_places']}")
                return (
                    self._get_artwork_image(piece),
                    f"Ville '{city}' correspond exactement",
                    _match_info(piece),
                )

    logger.info("Aucun match exact trouvé, passage à la sélection par émotions")

    emotion_profile = self.calculate_emotion_profile(selected_image_ids)

    logger.info("=== STRATÉGIE DE MATCHING ===")
    # Compare each score with the zero tuple element by element: writing
    # `Series > (0, 0, 0)` relies on pandas treating the tuple as a scalar
    # rather than as a length-3 sequence to align with the column.
    has_score = (
        preselected["score"].map(lambda score: score > (0, 0, 0)).astype(bool)
    )
    valid_preselection = preselected[has_score]

    if len(valid_preselection) > 0:
        logger.info(
            f"📋 CAS A: {len(valid_preselection)} œuvres pré-sélectionnées - utilisation des émotions pour départager"
        )
        candidates = valid_preselection
    else:
        logger.info(
            f"📋 CAS B: Aucune pré-sélection valide - recherche par émotions sur {len(self.df_with_images)} œuvres"
        )
        candidates = self.df_with_images

    # Never recommend an artwork the visitor has already picked.
    selected_artwork_ids = set(selected_image_ids)
    candidates = candidates[
        ~candidates["database_id"].astype(str).isin(selected_artwork_ids)
    ]
    logger.info(
        f"Après exclusion des œuvres déjà sélectionnées {selected_artwork_ids}: {len(candidates)} candidats restants"
    )

    logger.info("=== CALCUL DES SCORES ÉMOTIONNELS ===")
    best_matches = []
    best_emotion_score = -1

    for _, piece in candidates.iterrows():
        piece_emotions = piece["related_emotions"]
        emotion_score = 0

        if emotion_profile and piece_emotions:
            for user_emotion, weight in emotion_profile.items():
                # Best similarity between this user emotion and any emotion
                # of the artwork, never below 0.
                best_similarity = max(
                    0,
                    *(
                        self._cached_emotion_similarity(user_emotion, piece_emotion)
                        for piece_emotion in piece_emotions
                    ),
                )
                emotion_score += best_similarity * weight

            # Normalise by the number of emotions attached to the artwork.
            emotion_score /= len(piece_emotions)

        if emotion_score > best_emotion_score:
            best_emotion_score = emotion_score
            best_matches = [piece]
            logger.info(
                f" Nouveau meilleur score émotionnel: {emotion_score:.3f} - Œuvre #{piece['database_id']}"
            )
        elif emotion_score == best_emotion_score and emotion_score > 0:
            best_matches.append(piece)
            logger.info(
                f" Score égal au meilleur: {emotion_score:.3f} - Œuvre #{piece['database_id']}"
            )

    logger.info(
        f"Nombre de meilleures correspondances: {len(best_matches)} avec score {best_emotion_score:.3f}"
    )

    if len(best_matches) > 1:
        logger.info("=== DÉPARTAGE PAR TYPE D'OBJET ===")
        # How often each object type appears among the visitor's picks.
        selected_types_counter = Counter(
            self.df_with_images.loc[self.id_to_index[img_id], "art_piece_type"]
            for img_id in selected_image_ids
            if img_id in self.id_to_index
        )

        type_scores = [
            selected_types_counter.get(piece["art_piece_type"], 0)
            for piece in best_matches
        ]
        best_type_score = max(type_scores)
        type_scored_matches = [
            piece
            for piece, type_score in zip(best_matches, type_scores)
            if type_score == best_type_score
        ]

        if len(type_scored_matches) > 1:
            logger.info(
                f" {len(type_scored_matches)} œuvres avec le même score de type ({best_type_score}) - sélection aléatoire"
            )
            best_match = random.choice(type_scored_matches)
            match_reason = "Sélection aléatoire parmi les meilleures correspondances"
        else:
            best_match = type_scored_matches[0]
            match_reason = f"Type d'objet '{best_match['art_piece_type']}' préféré"
            logger.info(
                f" Type '{best_match['art_piece_type']}' sélectionné avec score {best_type_score}"
            )
    elif len(best_matches) == 1:
        best_match = best_matches[0]
        match_reason = "Meilleure correspondance émotionnelle"
    else:
        logger.info("Aucune correspondance trouvée")
        return None, "Aucune correspondance trouvée", {}

    reasons = []
    if len(valid_preselection) > 0:
        if firstname and best_match["related_names"]:
            name_score = Optimizer.name_similarity(
                firstname, best_match["related_names"]
            )
            if name_score > 0:
                reasons.append(f"prénom '{firstname}' trouvé")

        if birth_date and best_match["related_dates"]:
            date_score = Optimizer.date_similarity(
                birth_date, best_match["related_dates"]
            )
            if date_score > 0:
                reasons.append(
                    f"date {'exacte' if date_score == 1.0 else 'partielle'}"
                )

        if city and best_match["related_places"]:
            place_score = self.optimizer_helper.place_similarity(
                city, best_match["related_places"]
            )
            if place_score > 0:
                reasons.append(f"ville '{city}' trouvée")

    if best_emotion_score > 0:
        reasons.append("correspondance émotionnelle")

    if not reasons:
        reasons.append(match_reason)

    final_reason = " ; ".join(reasons)

    logger.info(f"\n🏆 RÉSULTAT FINAL: Œuvre #{best_match['database_id']}")
    logger.info(f" Raison: {final_reason}")
    logger.info(f" Type: {best_match['art_piece_type']}")
    logger.info(f" Lieu: {best_match['art_piece_place']}")

    match_image = self._get_artwork_image(best_match)
    return match_image, final_reason, _match_info(best_match)
1125
+
1126
+
1127
# Locations of the collection CSV and of the image directory, relative to
# the working directory of the process.
csv_path = "PP1-Collection_Database_new-cleaned.csv"
images_dir = "pictures_data"

# A missing resource is only logged here; the matcher is still constructed
# below with the same paths.
if not os.path.exists(csv_path):
    logger.error(f"Fichier CSV introuvable: {csv_path}")
if not os.path.exists(images_dir):
    logger.error(f"Répertoire images introuvable: {images_dir}")

# Module-level matcher used by the handler functions of this module.
matcher = ArtMatcherV2(csv_path, images_dir)
1136
+
1137
+
1138
def process_user_info(firstname: str, birthday: str, city: str, state: SessionState):
    """
    Validate the visitor's details and store them in the session state.

    The first name and city are sanitised before being stored; the
    birthday is stored as typed. The first name and the birthday are
    mandatory, and the birthday must pass
    ``SecurityValidator.validate_date``.

    Returns:
        A 5-tuple: updates for the info, selection and results sections,
        a status message, and the session state.
    """
    clean_firstname = SecurityValidator.sanitize_input(firstname)
    clean_city = SecurityValidator.sanitize_input(city)

    state.firstname = clean_firstname
    state.birthday = birthday
    state.city = clean_city

    problem = None
    if not (clean_firstname and birthday):
        problem = "Veuillez remplir au moins votre prénom et date de naissance."
    else:
        # The date is only validated once both mandatory fields are present.
        date_ok, _ = SecurityValidator.validate_date(birthday)
        if not date_ok:
            problem = "Format de date invalide. Utilisez JJ/MM (ex: 15/03)"

    accepted = problem is None
    if accepted:
        status = "Informations enregistrées ! Passons à la sélection d'images."
    else:
        status = problem

    return (
        gr.update(visible=not accepted),  # info form stays open on error
        gr.update(visible=accepted),  # selection step opens on success
        gr.update(visible=False),
        status,
        state,
    )
1173
+
1174
+
1175
def load_images_for_round(round_num: int, state: SessionState):
    """
    Load the images offered to the visitor for one selection round.

    Artworks already recorded in ``state.selected_images`` are passed to
    the matcher so they can be excluded. On success the offered artwork
    ids are stored in ``state.current_image_ids``.

    Returns:
        ``(image_paths, artwork_ids, message, state)``. When fewer images
        than ``ScoringWeights.MAX_IMAGES_PER_SELECTION`` are available the
        paths are three ``None`` values and the ids an empty list.
    """
    offered = matcher.get_random_images_for_selection(
        round_num, state.selected_images
    )
    offered_count = len(offered)

    if offered_count < ScoringWeights.MAX_IMAGES_PER_SELECTION:
        logger.warning(f"Seulement {offered_count} images disponibles")
        shortage = (
            f"Pas assez d'images disponibles (seulement {offered_count} trouvées)"
        )
        return [None, None, None], [], shortage, state

    paths = [path for path, _ in offered]
    artwork_ids = [artwork_id for _, artwork_id in offered]
    state.current_image_ids = artwork_ids

    prompt = f"Tour {round_num + 1}/{ScoringWeights.TOTAL_ROUNDS} : Sélectionnez l'image qui vous attire le plus"
    return paths, artwork_ids, prompt, state
1201
+
1202
+
1203
def select_image(choice: Optional[int], state: SessionState):
    """
    Record the visitor's pick for the current round and prepare the next step.

    Returns:
        * a 6-tuple ``(img1, img2, img3, image_choice, message, state)``
          of no-op updates when nothing was chosen or the choice does not
          map to a displayed image;
        * otherwise an 8-tuple with two extra trailing updates for the
          selection and loading sections: the next round's images while
          rounds remain, or a switch to the loading section once
          ``ScoringWeights.TOTAL_ROUNDS`` picks have been made.
    """

    def _unchanged(message: str):
        # Leave the three images and the radio untouched; only report.
        return (gr.update(), gr.update(), gr.update(), gr.update(), message, state)

    if choice is None:
        return _unchanged("Veuillez sélectionner une image")

    shown_ids = state.current_image_ids
    if not shown_ids or len(shown_ids) <= choice:
        return _unchanged("Erreur: image non trouvée")

    selected_id = shown_ids[choice]
    state.selected_images.append(selected_id)
    state.current_round += 1

    logger.info(
        f"Tour {state.current_round}: Image {choice+1} sélectionnée (ID: {selected_id})"
    )

    if state.current_round >= ScoringWeights.TOTAL_ROUNDS:
        # Every round is done: hide the selection step and show the loader.
        return (
            gr.update(),  # img1
            gr.update(),  # img2
            gr.update(),  # img3
            gr.update(),  # image_choice
            "",  # empty status message
            state,
            gr.update(visible=False),  # hide selection_section
            gr.update(visible=True),  # show loading_section
        )

    next_images, _next_ids, round_message, state = load_images_for_round(
        state.current_round, state
    )
    return (
        gr.update(value=next_images[0]),
        gr.update(value=next_images[1]),
        gr.update(value=next_images[2]),
        gr.update(value=None),  # clear the radio for the new round
        round_message,
        state,
        gr.update(visible=True),  # keep selection_section visible
        gr.update(visible=False),  # keep loading_section hidden
    )
1260
+
1261
+
1262
def show_results(state: SessionState):
    """
    Compute the final recommendation and build the results view.

    When the session is not complete the selection section is shown again
    and nothing is computed. Otherwise the matcher is queried, the outcome
    is written to the session state and logged through
    ``session_tracker``, and the results section is populated.

    Returns:
        A 7-tuple: updates for the info, selection, loading and results
        sections, then the result image, its title and the explanation
        text.
    """
    if not state.is_complete():
        return (
            gr.update(visible=False),  # info_section
            gr.update(visible=True),  # selection_section
            gr.update(visible=False),  # loading_section
            gr.update(visible=False),  # results_section
            None,
            "",
            "",
        )

    match_image, reason, info = matcher.find_best_match(
        state.firstname,
        state.birthday,
        state.city,
        state.selected_images,
    )
    found = bool(match_image)

    if found:
        # Exact name/date/place matches are recognised by their reason
        # text; everything else comes from the emotion ranking.
        exact = "correspond exactement" in reason.lower()
        recommendation_type = "name_date_place" if exact else "emotions"
        final_artwork = info.get("title", "Œuvre inconnue")
    else:
        recommendation_type = "none"
        final_artwork = "Aucune œuvre trouvée"

    # The session is logged in both cases, before the text is built.
    state.final_artwork = final_artwork
    state.recommendation_type = recommendation_type
    session_tracker.log_session(state, recommendation_type)

    if found:
        shown_title = info.get("title", "Œuvre non trouvée")
        explanation = f"""
        **Votre œuvre correspondante a été trouvée !**

        **Raison du match :** {reason}

        **Détails de l'œuvre :**
        - Type : {info.get('type', 'Non spécifié')}
        - Lieu : {info.get('place', 'Non spécifié')}
        - Émotions : {', '.join(info.get('emotions', [])) if info.get('emotions') else 'Non spécifiées'}

        **Description :**
        {info.get('explanation', 'Aucune description disponible')}
        """
    else:
        shown_title = "Œuvre non trouvée"
        explanation = "Désolé, aucune œuvre correspondante n'a pu être trouvée."

    return (
        gr.update(visible=False),  # info_section
        gr.update(visible=False),  # selection_section
        gr.update(visible=False),  # loading_section
        gr.update(visible=True),  # results_section
        match_image,
        shown_title,
        explanation,
    )
1328
+
1329
+
1330
# Gradio interface: four sections (info form, image selection, loading
# spinner, results) whose visibility is switched by the event handlers
# defined further down in this block.
with gr.Blocks(
    title="Art Matcher",
    theme=gr.themes.Soft(primary_hue="teal", secondary_hue="teal", neutral_hue="zinc"),
) as demo:
    gr.Markdown(
        """
        # 🎨 Art Matcher
        ### Découvrez l'œuvre d'art qui vous correspond !

        Cette application utilise vos informations personnelles et vos préférences visuelles
        pour trouver l'œuvre d'art qui vous correspond le mieux dans notre collection.
        """
    )

    # Session object passed to and returned by every handler.
    session_state = gr.State(SessionState())

    # Step 1: personal information form (visible at start-up).
    with gr.Group(visible=True) as info_section:
        gr.Markdown("### Étape 1 : Vos informations")
        with gr.Row():
            firstname_input = gr.Textbox(
                label="Prénom", placeholder="Entrez votre prénom", max_lines=1
            )
            birthday_input = gr.Textbox(
                label="Date d'anniversaire (JJ/MM)",
                placeholder="Ex: 25/12",
                max_lines=1,
            )
            city_input = gr.Textbox(
                label="Ville de résidence", placeholder="Ex: Paris", max_lines=1
            )

        submit_info_btn = gr.Button("Valider mes informations", variant="primary")

    # Step 2: three images and a radio to pick one of them.
    with gr.Group(visible=False) as selection_section:
        selection_title = gr.Markdown("### Étape 2 : Sélection d'images")

        with gr.Row():
            img1 = gr.Image(label="Image 1", type="filepath", height=300)
            img2 = gr.Image(label="Image 2", type="filepath", height=300)
            img3 = gr.Image(label="Image 3", type="filepath", height=300)

        # type="index" hands the handler the position of the chosen option
        # instead of its label.
        image_choice = gr.Radio(
            choices=["Image 1", "Image 2", "Image 3"],
            label="Quelle image vous attire le plus ?",
            type="index",
        )

        select_btn = gr.Button("Valider mon choix", variant="primary")

    # Spinner displayed between the last selection and the results.
    with gr.Group(visible=False) as loading_section:
        gr.Markdown("### ⏳ Analyse en cours...")
        gr.HTML(
            """
            <div style="text-align: center; padding: 40px;">
            <div style="display: inline-block; width: 60px; height: 60px; border: 6px solid #f3f3f3; border-top: 6px solid #14b8a6; border-radius: 50%; animation: spin 1s linear infinite;"></div>
            <style>
            @keyframes spin {
            0% { transform: rotate(0deg); }
            100% { transform: rotate(360deg); }
            }
            </style>
            <p style="margin-top: 20px; font-size: 18px; color: #666;">
            <strong>Traitement de vos sélections...</strong><br>
            <span style="font-size: 14px;">Nous analysons votre profil pour trouver l'œuvre parfaite</span>
            </p>
            </div>
            """
        )

    # Final step: matched artwork, its title and the explanation text.
    with gr.Group(visible=False) as results_section:
        gr.Markdown("### Votre œuvre correspondante")

        with gr.Row():
            with gr.Column(scale=1):
                result_image = gr.Image(label="Votre œuvre", height=400)
                result_title = gr.Markdown("## Titre de l'œuvre")

            with gr.Column(scale=1):
                result_explanation = gr.Markdown("")

        restart_btn = gr.Button("Recommencer", variant="secondary")

    # Status line written by the form, selection and restart handlers.
    status_message = gr.Markdown("")

    def on_info_submit(firstname, birthday, city, state):
        """Reset the session, validate the form and load the first round."""
        state.reset()

        info_vis, select_vis, results_vis, message, state = process_user_info(
            firstname, birthday, city, state
        )

        # The selection section is made visible only when validation
        # succeeded; in that case the first round of images is loaded.
        if select_vis["visible"]:
            images, ids, round_message, state = load_images_for_round(0, state)
            return (
                info_vis,
                select_vis,
                results_vis,
                images[0] if len(images) > 0 else None,
                images[1] if len(images) > 1 else None,
                images[2] if len(images) > 2 else None,
                round_message,
                state,
            )
        else:
            return (info_vis, select_vis, results_vis, None, None, None, message, state)

    submit_info_btn.click(
        fn=on_info_submit,
        inputs=[firstname_input, birthday_input, city_input, session_state],
        outputs=[
            info_section,
            selection_section,
            results_section,
            img1,
            img2,
            img3,
            status_message,
            session_state,
        ],
    )

    def on_image_select(choice, state):
        """Adapt the result of ``select_image`` to the ten outputs of the click."""
        result = select_image(choice, state)

        # select_image returns 8 values when a selection was recorded
        if len(result) == 8:
            (
                img1_update,
                img2_update,
                img3_update,
                choice_update,
                message,
                state,
                selection_vis,
                loading_vis,
            ) = result
            return (
                gr.update(),  # info_section
                selection_vis,  # selection_section
                loading_vis,  # loading_section
                gr.update(),  # results_section
                img1_update,  # img1
                img2_update,  # img2
                img3_update,  # img3
                choice_update,  # image_choice
                message,  # status_message
                state,
            )
        else:
            # 6-value format (no section visibility change)
            (img1_update, img2_update, img3_update, choice_update, message, state) = (
                result
            )
            return (
                gr.update(),  # info_section
                gr.update(),  # selection_section
                gr.update(),  # loading_section
                gr.update(),  # results_section
                img1_update,  # img1
                img2_update,  # img2
                img3_update,  # img3
                choice_update,  # image_choice
                message,  # status_message
                state,
            )

    def handle_final_results(state):
        """Show the results once the session is complete; otherwise change no section."""
        if state.is_complete():
            return show_results(state)
        else:
            return gr.update(), gr.update(), gr.update(), gr.update(), None, "", ""

    # The selection handler runs first; the chained step then displays the
    # results when the session has become complete.
    select_btn.click(
        fn=on_image_select,
        inputs=[image_choice, session_state],
        outputs=[
            info_section,
            selection_section,
            loading_section,
            results_section,
            img1,
            img2,
            img3,
            image_choice,
            status_message,
            session_state,
        ],
    ).then(
        fn=handle_final_results,
        inputs=[session_state],
        outputs=[
            info_section,
            selection_section,
            loading_section,
            results_section,
            result_image,
            result_title,
            result_explanation,
        ],
    )

    def restart_app(state):
        """Reset the session and bring the interface back to the info form."""
        state.reset()

        return (
            gr.update(visible=True),  # info_section
            gr.update(visible=False),  # selection_section
            gr.update(visible=False),  # loading_section
            gr.update(visible=False),  # results_section
            "",  # firstname_input
            "",  # birthday_input
            "",  # city_input
            None,  # image_choice
            "Application réinitialisée. Veuillez entrer vos informations.",  # status_message
            state,
        )

    restart_btn.click(
        fn=restart_app,
        inputs=[session_state],
        outputs=[
            info_section,
            selection_section,
            loading_section,
            results_section,
            firstname_input,
            birthday_input,
            city_input,
            image_choice,
            status_message,
            session_state,
        ],
    )
1563
+
1564
+
1565
# Start the Gradio app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()