Dangindev commited on
Commit
b0ce04d
·
verified ·
1 Parent(s): c15cc4e

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. core/evaluation.py +628 -0
  2. core/post_hoc_explainer.py +418 -0
  3. core/viet_meagent.py +964 -0
core/evaluation.py ADDED
@@ -0,0 +1,628 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import numpy as np
3
+ from typing import Dict, List
4
+ import logging
5
+ from rouge_score import rouge_scorer
6
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
7
+ from sentence_transformers import SentenceTransformer
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+ import re
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ class VietMEAgentEvaluator:
15
+ """Comprehensive evaluation for VietMEAgent - FIXED VERSION"""
16
+
17
def __init__(self, cultural_kb_path: str):
    """Load the cultural knowledge base and set up the scoring tools.

    Args:
        cultural_kb_path: Path to a UTF-8 JSON file. Its top-level
            ``objects`` mapping is iterated to build the vocabulary; each
            value may carry an optional ``name`` entry.

    Raises:
        OSError: If the knowledge-base file cannot be opened.
        KeyError: If the JSON document has no ``objects`` entry.
    """
    with open(cultural_kb_path, 'r', encoding='utf-8') as f:
        self.cultural_kb = json.load(f)

    class _WhitespaceTokenizer:
        """Tokenizer object exposing the ``tokenize`` hook used by RougeScorer."""

        def tokenize(self, text):
            return text.split()

    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    try:
        # The default rouge_score tokenizer keeps only ASCII [a-z0-9], which
        # splits Vietnamese words at every accented letter. Texts are already
        # cleaned before scoring, so whitespace splitting is sufficient.
        self.rouge_scorer = rouge_scorer.RougeScorer(
            rouge_types, use_stemmer=False, tokenizer=_WhitespaceTokenizer()
        )
    except TypeError:
        # Older rouge_score releases do not accept ``tokenizer``.
        self.rouge_scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=False)

    self.sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    self.smoothing = SmoothingFunction().method1

    # Vocabulary of lower-cased cultural terms: KB keys plus their display names.
    self.cultural_vocabulary = set()
    for obj_name, obj_data in self.cultural_kb['objects'].items():
        self.cultural_vocabulary.add(obj_name.lower())
        if 'name' in obj_data:
            self.cultural_vocabulary.add(obj_data['name'].lower())

    # Common Vietnamese cultural terms that may be absent from the KB.
    additional_terms = [
        'phở', 'bánh mì', 'áo dài', 'nón lá', 'chùa', 'đình', 'làng', 'thờ',
        'tết', 'trung thu', 'gỏi cuốn', 'bánh xèo', 'cà phê', 'trúc', 'tre',
        'đàn bầu', 'trống', 'sáo', 'múa lân', 'rối nước', 'việt nam'
    ]
    self.cultural_vocabulary.update(additional_terms)

    logger.info(f"Initialized evaluator with {len(self.cultural_vocabulary)} cultural terms")
44
+
45
def evaluate_batch(self, predictions: List[Dict], ground_truth: List[Dict]) -> Dict:
    """Run every metric family over a batch and return the combined results.

    Args:
        predictions: Model outputs, one dict per sample.
        ground_truth: Reference annotations, paired with ``predictions`` by position.

    Returns:
        Dict with the keys 'language_quality', 'cultural_relevance',
        'visual_grounding' and 'overall_performance'.
    """
    logger.info(f"Evaluating {len(predictions)} predictions against {len(ground_truth)} ground truth")

    language = self.evaluate_language_quality(predictions, ground_truth)
    cultural = self.evaluate_cultural_relevance(predictions, ground_truth)
    grounding = self.evaluate_visual_grounding(predictions, ground_truth)

    results = {
        'language_quality': language,
        'cultural_relevance': cultural,
        'visual_grounding': grounding,
        'overall_performance': {},
    }

    # The overall score is derived from the three sections filled in above.
    results['overall_performance'] = self.calculate_overall_performance(results)

    self.debug_evaluation_results(results, predictions, ground_truth)
    return results
73
+
74
def debug_evaluation_results(self, results: Dict, predictions: List[Dict], ground_truth: List[Dict]):
    """Log the extracted text and cultural objects of the first sample pair.

    Nothing is logged between the banner lines when either list is empty.
    ``results`` is accepted but not read here.
    """
    logger.info("=== EVALUATION DEBUG ===")

    if predictions and ground_truth:
        first_pred, first_gt = predictions[0], ground_truth[0]

        # Text comparison, truncated to the first 100 characters.
        sample_pred_text = self.extract_text_from_prediction(first_pred)
        sample_gt_text = self.extract_text_from_ground_truth(first_gt)
        logger.info(f"Sample prediction text: {sample_pred_text[:100]}...")
        logger.info(f"Sample ground truth text: {sample_gt_text[:100]}...")

        # Cultural objects found on each side.
        sample_pred_objects = self.extract_cultural_objects(first_pred)
        sample_gt_objects = self.extract_cultural_objects(first_gt)
        logger.info(f"Pred cultural objects: {sample_pred_objects}")
        logger.info(f"GT cultural objects: {sample_gt_objects}")

    logger.info("=== END DEBUG ===")
92
+
93
def evaluate_language_quality(self, predictions: List[Dict], ground_truth: List[Dict]) -> Dict:
    """Score predicted text against reference text with BLEU and ROUGE.

    Pairs whose extracted text is empty on either side (before or after
    cleaning) are skipped.

    Args:
        predictions: Model outputs, one dict per sample.
        ground_truth: Reference annotations, paired by position.

    Returns:
        Dict with mean 'bleu', 'rouge1', 'rouge2', 'rougeL' (0.0 when nothing
        was scored) and 'num_evaluated', the number of pairs compared.
    """
    bleu_scores = []
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    valid_comparisons = 0

    for pred, gt in zip(predictions, ground_truth):
        pred_clean = self.clean_vietnamese_text(self.extract_text_from_prediction(pred))
        gt_clean = self.clean_vietnamese_text(self.extract_text_from_ground_truth(gt))
        if not pred_clean or not gt_clean:
            continue

        valid_comparisons += 1

        pred_tokens = self.tokenize_vietnamese(pred_clean)
        gt_tokens = self.tokenize_vietnamese(gt_clean)
        if pred_tokens and gt_tokens:
            # Besides the full reference, accept it with the last or the
            # first word dropped so a one-word boundary mismatch costs less.
            references = [gt_tokens]
            if len(gt_tokens) > 3:
                references.append(gt_tokens[:-1])
                references.append(gt_tokens[1:])

            bleu = sentence_bleu(
                references,
                pred_tokens,
                smoothing_function=self.smoothing,
                weights=(0.5, 0.3, 0.2)  # uni-, bi-, tri-gram weights
            )
            bleu_scores.append(bleu)

        try:
            # RougeScorer.score takes (target, prediction); the F-measure is
            # symmetric, but precision/recall on the result are not.
            rouge_result = self.rouge_scorer.score(gt_clean, pred_clean)
            for metric in rouge_scores:
                rouge_scores[metric].append(rouge_result[metric].fmeasure)
        except Exception as e:
            # Best effort: one failing sample must not abort the whole batch.
            logger.warning(f"ROUGE calculation failed: {e}")

    logger.info(f"Language quality: {valid_comparisons} valid comparisons out of {len(predictions)}")

    def _mean(values):
        return np.mean(values) if values else 0.0

    return {
        'bleu': _mean(bleu_scores),
        'rouge1': _mean(rouge_scores['rouge1']),
        'rouge2': _mean(rouge_scores['rouge2']),
        'rougeL': _mean(rouge_scores['rougeL']),
        'num_evaluated': valid_comparisons
    }
151
+
152
def clean_vietnamese_text(self, text: str) -> str:
    """Lower-case, strip punctuation and collapse whitespace in Vietnamese text.

    Args:
        text: Raw text; falsy input yields an empty string.

    Returns:
        The text in NFC form, lower-cased, containing only word characters
        separated by single spaces.
    """
    import unicodedata

    if not text:
        return ""

    # Compose diacritics first: in decomposed (NFD) input the tone and vowel
    # marks are separate combining characters that the filter below would
    # otherwise delete, turning "phở" into "pho".
    text = unicodedata.normalize('NFC', text.lower())

    # \w is Unicode-aware for str patterns, so accented letters are kept.
    text = re.sub(r'[^\w\s]', '', text)

    # Collapse whitespace last: removing punctuation can leave double spaces.
    return re.sub(r'\s+', ' ', text).strip()
167
+
168
def tokenize_vietnamese(self, text: str) -> List[str]:
    """Split text on whitespace and drop single-character tokens.

    Args:
        text: Text to tokenize; falsy input yields an empty list.

    Returns:
        Whitespace-delimited tokens longer than one character, in order.
    """
    if not text:
        return []
    return [word for word in text.split() if len(word) > 1]
180
+
181
def evaluate_cultural_relevance(self, predictions: List[Dict], ground_truth: List[Dict]) -> Dict:
    """Measure how well predictions cover the cultural content of the references.

    Per pair, cultural terms come from ``extract_cultural_objects`` and from
    vocabulary mentions in the extracted text. Precision/recall are recorded
    only when the reference has any cultural term; the semantic accuracy is
    recorded whenever both texts are non-empty.

    Returns:
        Dict with mean 'cultural_precision', 'cultural_recall',
        'cultural_accuracy', the pooled 'cultural_mention_accuracy',
        'cultural_f1' (from the two means) and 'num_cultural_samples'.
    """
    precisions = []
    recalls = []
    accuracies = []
    mention_records = []

    for pred, gt in zip(predictions, ground_truth):
        pred_objects = self.extract_cultural_objects(pred)
        gt_objects = self.extract_cultural_objects(gt)

        pred_text = self.extract_text_from_prediction(pred)
        gt_text = self.extract_text_from_ground_truth(gt)

        pred_mentions = self.count_cultural_mentions(pred_text)
        gt_mentions = self.count_cultural_mentions(gt_text)

        mention_records.append({
            'pred_mentions': pred_mentions,
            'gt_mentions': gt_mentions,
            'mention_overlap': len(set(pred_mentions) & set(gt_mentions)),
        })

        if gt_objects or gt_mentions:
            reference_terms = gt_objects | set(gt_mentions)
            predicted_terms = pred_objects | set(pred_mentions)
            shared = len(predicted_terms & reference_terms)

            # A side with no terms contributes nothing rather than a zero.
            if predicted_terms:
                precisions.append(shared / len(predicted_terms))
            if reference_terms:
                recalls.append(shared / len(reference_terms))

        if pred_text and gt_text:
            accuracies.append(self.evaluate_cultural_context_accuracy(pred, gt))

    # Pooled over the batch: shared mentions / reference mentions.
    mention_accuracy = 0.0
    if mention_records:
        overlap_total = sum(rec['mention_overlap'] for rec in mention_records)
        gt_total = sum(len(rec['gt_mentions']) for rec in mention_records)
        if gt_total > 0:
            mention_accuracy = overlap_total / gt_total

    mean_precision = np.mean(precisions) if precisions else 0.0
    mean_recall = np.mean(recalls) if recalls else 0.0

    return {
        'cultural_precision': mean_precision,
        'cultural_recall': mean_recall,
        'cultural_accuracy': np.mean(accuracies) if accuracies else 0.0,
        'cultural_mention_accuracy': mention_accuracy,
        'cultural_f1': self.calculate_f1(mean_precision, mean_recall),
        'num_cultural_samples': len(mention_records),
    }
243
+
244
def count_cultural_mentions(self, text: str) -> List[str]:
    """Return the vocabulary terms that occur in ``text`` as whole words.

    Matching is case-insensitive and bounded by non-word characters, so a
    term is not reported when it is merely part of a longer word (for
    example "tre" inside "treo").

    Args:
        text: Text to scan; falsy input yields an empty list.

    Returns:
        Each matching term once, sorted so the result is deterministic
        (the vocabulary is a set with no stable iteration order).
    """
    if not text:
        return []

    text_lower = text.lower()
    mentions = []
    for cultural_term in self.cultural_vocabulary:
        if not cultural_term:
            # An empty term would match every text.
            continue
        pattern = rf'(?<!\w){re.escape(cultural_term)}(?!\w)'
        if re.search(pattern, text_lower):
            mentions.append(cultural_term)

    return sorted(mentions)
257
+
258
def evaluate_visual_grounding(self, predictions: List[Dict], ground_truth: List[Dict]) -> Dict:
    """Score predicted heatmaps and detected cultural objects.

    For every pair with a non-empty prediction heatmap, a heuristic quality
    score is computed. The grounding score is the IoU against the reference
    'attention_regions' when present, otherwise half of the quality score.
    Detection accuracy is computed when either side lists cultural objects.

    Returns:
        Dict with mean 'visual_grounding', 'detection_accuracy' and
        'heatmap_quality' (0.0 when nothing was scored) plus the sample
        counts 'num_grounding_samples' and 'num_detection_samples'.
    """

    def _cultural_objects(record):
        """Return the object list, preferring image_analysis over the top level."""
        analysis = record.get('image_analysis')
        if isinstance(analysis, dict) and 'cultural_objects' in analysis:
            return analysis['cultural_objects'] or []
        return record.get('cultural_objects') or []

    grounding_scores = []
    detection_accuracy = []
    heatmap_quality = []

    for pred, gt in zip(predictions, ground_truth):
        raw_heatmap = pred.get('heatmap')
        # A present-but-None heatmap is treated like a missing one.
        if raw_heatmap is not None:
            heatmap = np.asarray(raw_heatmap)

            if heatmap.size > 0:
                concentration = np.std(heatmap)
                coverage = np.mean(heatmap > 0.3)
                max_attention = np.max(heatmap)

                # Heuristic blend, capped at 1.0.
                quality_score = min(1.0, (concentration * 2 + coverage + max_attention) / 3)
                heatmap_quality.append(quality_score)

                if 'attention_regions' in gt:
                    iou = self.calculate_grounding_accuracy(heatmap, gt['attention_regions'])
                    grounding_scores.append(iou)
                else:
                    # No reference regions: fall back to a discounted proxy.
                    grounding_scores.append(quality_score * 0.5)

        pred_objects = _cultural_objects(pred)
        gt_objects = _cultural_objects(gt)
        if gt_objects or pred_objects:
            detection_accuracy.append(
                self.calculate_detection_accuracy(pred_objects, gt_objects)
            )

    return {
        'visual_grounding': np.mean(grounding_scores) if grounding_scores else 0.0,
        'detection_accuracy': np.mean(detection_accuracy) if detection_accuracy else 0.0,
        'heatmap_quality': np.mean(heatmap_quality) if heatmap_quality else 0.0,
        'num_grounding_samples': len(grounding_scores),
        'num_detection_samples': len(detection_accuracy)
    }
312
+
313
def extract_text_from_prediction(self, prediction: Dict) -> str:
    """Collect every text field of a prediction into one space-joined string.

    Reads, in order: each question's 'explanation', 'answer' and 'question';
    the top-level 'vietnamese_explanation'; and the entries of
    image_analysis['vietnamese_text']. Missing, ``None`` or empty fields are
    skipped.

    Args:
        prediction: Prediction record.

    Returns:
        The collected text, or an empty string when nothing was found.
    """
    texts = []

    for q in prediction.get('questions') or []:
        if not isinstance(q, dict):
            continue
        for field in ('explanation', 'answer', 'question'):
            if q.get(field):
                texts.append(str(q[field]))

    explanation = prediction.get('vietnamese_explanation')
    if explanation:
        texts.append(str(explanation))

    analysis = prediction.get('image_analysis')
    if isinstance(analysis, dict):
        detected = analysis.get('vietnamese_text') or []
        # A bare string must be kept whole, not iterated character by character.
        if isinstance(detected, str):
            detected = [detected]
        texts.extend(str(t) for t in detected if t)

    return ' '.join(texts)
338
+
339
def extract_text_from_ground_truth(self, ground_truth: Dict) -> str:
    """Collect every text field of a reference record into one space-joined string.

    Reads, in order: each question's 'explanation', 'answer' and 'question';
    then the entries of image_analysis['vietnamese_text']. Missing, ``None``
    or empty fields are skipped.

    Args:
        ground_truth: Reference record.

    Returns:
        The collected text, or an empty string when nothing was found.
    """
    texts = []

    for q in ground_truth.get('questions') or []:
        if not isinstance(q, dict):
            continue
        for field in ('explanation', 'answer', 'question'):
            if q.get(field):
                texts.append(str(q[field]))

    analysis = ground_truth.get('image_analysis')
    if isinstance(analysis, dict):
        detected = analysis.get('vietnamese_text') or []
        # A bare string must be kept whole, not iterated character by character.
        if isinstance(detected, str):
            detected = [detected]
        texts.extend(str(t) for t in detected if t)

    return ' '.join(texts)
360
+
361
def extract_cultural_objects(self, data: Dict) -> set:
    """Gather the cultural terms a record mentions or lists explicitly.

    The record's text is taken with the prediction extractor when it has a
    'questions' key and with the ground-truth extractor otherwise. Every
    vocabulary term contained in that text is collected, together with the
    lower-cased string form of each entry of 'cultural_objects' and of
    image_analysis['cultural_objects'].

    Returns:
        Set of lower-cased terms.
    """
    extractor = (
        self.extract_text_from_prediction
        if 'questions' in data
        else self.extract_text_from_ground_truth
    )
    haystack = extractor(data).lower()

    # Vocabulary terms occurring anywhere in the text (substring match).
    found = {term for term in self.cultural_vocabulary if term in haystack}

    # Explicitly listed objects, top level first, then under image_analysis.
    if 'cultural_objects' in data:
        found.update(str(item).lower() for item in data['cultural_objects'])

    if 'image_analysis' in data and 'cultural_objects' in data['image_analysis']:
        found.update(str(item).lower() for item in data['image_analysis']['cultural_objects'])

    return found
389
+
390
def evaluate_cultural_context_accuracy(self, prediction: Dict, ground_truth: Dict) -> float:
    """Return the clamped cosine similarity between prediction and reference text.

    Both records are reduced to cleaned text and embedded with the sentence
    model. The result is 0.0 when either text is empty before or after
    cleaning, when the similarity is negative, or when embedding fails.
    """
    raw_pred = self.extract_text_from_prediction(prediction)
    raw_gt = self.extract_text_from_ground_truth(ground_truth)
    if not (raw_pred and raw_gt):
        return 0.0

    cleaned_pred = self.clean_vietnamese_text(raw_pred)
    cleaned_gt = self.clean_vietnamese_text(raw_gt)
    if not (cleaned_pred and cleaned_gt):
        return 0.0

    try:
        vec_pred = self.sentence_model.encode([cleaned_pred])[0]
        vec_gt = self.sentence_model.encode([cleaned_gt])[0]

        cosine = np.dot(vec_pred, vec_gt) / (
            np.linalg.norm(vec_pred) * np.linalg.norm(vec_gt)
        )
        # Negative similarity is reported as no agreement.
        return max(0.0, float(cosine))
    except Exception as e:
        logger.warning(f"Cultural context accuracy calculation failed: {e}")
        return 0.0
422
+
423
def calculate_grounding_accuracy(self, pred_heatmap: np.ndarray, gt_regions: List) -> float:
    """Return the IoU between the thresholded heatmap and the reference boxes.

    Args:
        pred_heatmap: Attention heatmap; inputs with more than two dimensions
            are flattened to 2-D, keeping the last axis.
        gt_regions: Regions given as list/tuple whose first four entries are
            read as x, y, w, h. Other entries are ignored.

    Returns:
        Intersection over union of ``pred_heatmap > 0.5`` and the filled
        region mask; 0.0 for empty inputs, an empty union or any failure.
    """
    if len(gt_regions) == 0 or pred_heatmap.size == 0:
        return 0.0

    try:
        if pred_heatmap.ndim > 2:
            pred_heatmap = pred_heatmap.reshape(-1, pred_heatmap.shape[-1])

        region_mask = np.zeros_like(pred_heatmap)
        for box in gt_regions:
            if not isinstance(box, (list, tuple)) or len(box) < 4:
                continue

            left, top, box_w, box_h = box[:4]
            left, top, box_w, box_h = int(left), int(top), int(box_w), int(box_h)

            # Clamp the box origin into the mask and keep at least one cell.
            left = max(0, min(left, region_mask.shape[1] - 1))
            top = max(0, min(top, region_mask.shape[0] - 1))
            box_w = max(1, min(box_w, region_mask.shape[1] - left))
            box_h = max(1, min(box_h, region_mask.shape[0] - top))

            region_mask[top:top + box_h, left:left + box_w] = 1

        attended = (pred_heatmap > 0.5).astype(np.float32)

        overlap = np.logical_and(attended, region_mask).sum()
        combined = np.logical_or(attended, region_mask).sum()

        if combined > 0:
            return float(overlap / combined)
        return 0.0

    except Exception as e:
        logger.warning(f"Grounding accuracy calculation failed: {e}")
        return 0.0
460
+
461
def calculate_detection_accuracy(self, pred_objects: List, gt_objects: List) -> float:
    """Return the Jaccard similarity between predicted and reference objects.

    Objects are compared by their lower-cased, stripped string form; falsy
    entries are ignored. With no reference objects the score is 1.0 when
    nothing was predicted and 0.0 otherwise.
    """
    if not gt_objects:
        return 0.0 if pred_objects else 1.0

    predicted = {str(item).lower().strip() for item in pred_objects if item}
    expected = {str(item).lower().strip() for item in gt_objects if item}

    if not expected:
        return 0.0 if predicted else 1.0

    shared = len(predicted & expected)
    total = len(predicted | expected)
    return shared / total if total > 0 else 0.0
481
+
482
def calculate_f1(self, precision: float, recall: float) -> float:
    """Return the harmonic mean of precision and recall (0.0 when both are zero)."""
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * (precision * recall) / denominator
487
+
488
def calculate_overall_performance(self, results: Dict) -> Dict:
    """Blend the three metric families into one weighted score.

    Each section present in ``results`` is reduced to a component score
    (missing metrics count as 0.0), and the components are combined with
    weights 0.4 / 0.4 / 0.2.

    Returns:
        Dict with 'overall_score', the per-section 'component_scores' and
        the 'weights' that were applied.
    """
    weights = {
        'language_quality': 0.4,
        'cultural_relevance': 0.4,
        'visual_grounding': 0.2
    }

    def _language(metrics):
        # ROUGE-L is weighted above BLEU.
        return (metrics.get('rougeL', 0.0) * 0.7 + metrics.get('bleu', 0.0) * 0.3)

    def _cultural(metrics):
        return (
            metrics.get('cultural_accuracy', 0.0) * 0.4
            + metrics.get('cultural_f1', 0.0) * 0.3
            + metrics.get('cultural_mention_accuracy', 0.0) * 0.3
        )

    def _visual(metrics):
        return (
            metrics.get('visual_grounding', 0.0) * 0.4
            + metrics.get('detection_accuracy', 0.0) * 0.4
            + metrics.get('heatmap_quality', 0.0) * 0.2
        )

    reducers = {
        'language_quality': _language,
        'cultural_relevance': _cultural,
        'visual_grounding': _visual,
    }

    component_scores = {}
    overall_score = 0.0
    for aspect, weight in weights.items():
        if aspect not in results:
            continue
        component = reducers[aspect](results[aspect])
        component_scores[aspect] = component
        overall_score += weight * component

    return {
        'overall_score': overall_score,
        'component_scores': component_scores,
        'weights': weights
    }
530
+
531
def generate_evaluation_report(self, results: Dict, save_path: str = None) -> str:
    """Render the evaluation metrics as a plain-text report.

    Args:
        results: Metrics dict holding the 'language_quality',
            'cultural_relevance', 'visual_grounding' and
            'overall_performance' sections read by the template below.
        save_path: When truthy, the report is also written to this path as
            UTF-8 text.

    Returns:
        The formatted report.

    Raises:
        KeyError: If a section or metric referenced by the template is missing.
    """

    # Numeric metrics are printed with four decimals; counts and the
    # component-score dict are printed as-is.
    report = f"""
VietMEAgent Evaluation Report
{'='*50}

Language Quality:
BLEU Score: {results['language_quality']['bleu']:.4f}
ROUGE-1: {results['language_quality']['rouge1']:.4f}
ROUGE-2: {results['language_quality']['rouge2']:.4f}
ROUGE-L: {results['language_quality']['rougeL']:.4f}
Samples Evaluated: {results['language_quality']['num_evaluated']}

Cultural Relevance:
Cultural Precision: {results['cultural_relevance']['cultural_precision']:.4f}
Cultural Recall: {results['cultural_relevance']['cultural_recall']:.4f}
Cultural F1: {results['cultural_relevance']['cultural_f1']:.4f}
Cultural Accuracy: {results['cultural_relevance']['cultural_accuracy']:.4f}
Cultural Mention Accuracy: {results['cultural_relevance']['cultural_mention_accuracy']:.4f}
Cultural Samples: {results['cultural_relevance']['num_cultural_samples']}

Visual Grounding:
Grounding Accuracy: {results['visual_grounding']['visual_grounding']:.4f}
Detection Accuracy: {results['visual_grounding']['detection_accuracy']:.4f}
Heatmap Quality: {results['visual_grounding']['heatmap_quality']:.4f}
Grounding Samples: {results['visual_grounding']['num_grounding_samples']}
Detection Samples: {results['visual_grounding']['num_detection_samples']}

Overall Performance:
Overall Score: {results['overall_performance']['overall_score']:.4f}
Component Scores: {results['overall_performance']['component_scores']}

{'='*50}
"""

    # Persisting the report is optional.
    if save_path:
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(report)
        logger.info(f"Evaluation report saved to {save_path}")

    return report
573
+
574
def plot_evaluation_results(self, results: Dict, save_path: str = None):
    """Draw a 2x2 grid of bar charts summarising the evaluation results.

    Panels: language quality, cultural relevance, visual grounding and the
    three component scores. The figure is optionally saved, then shown.

    Args:
        results: Metrics dict with the sections read below.
        save_path: When truthy, the figure is saved there at 300 dpi.

    Returns:
        The matplotlib figure.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    def _bar_panel(ax, labels, values, color, title):
        """Draw one bar chart on a 0-1 scale with slanted x labels."""
        ax.bar(labels, values, color=color)
        ax.set_title(title)
        ax.set_ylim(0, 1)
        ax.tick_params(axis='x', rotation=45)

    section_panels = [
        (axes[0, 0], 'language_quality',
         ['bleu', 'rouge1', 'rouge2', 'rougeL'],
         'skyblue', 'Language Quality Metrics'),
        (axes[0, 1], 'cultural_relevance',
         ['cultural_precision', 'cultural_recall', 'cultural_f1', 'cultural_accuracy'],
         'lightcoral', 'Cultural Relevance Metrics'),
        (axes[1, 0], 'visual_grounding',
         ['visual_grounding', 'detection_accuracy', 'heatmap_quality'],
         'lightgreen', 'Visual Grounding Metrics'),
    ]
    for ax, section, metric_names, color, title in section_panels:
        values = [results[section][name] for name in metric_names]
        _bar_panel(ax, metric_names, values, color, title)

    # Fourth panel: component scores, with 0 for any missing component.
    component_scores = results['overall_performance']['component_scores']
    summary_labels = ['Language Quality', 'Cultural Relevance', 'Visual Grounding']
    summary_values = [
        component_scores.get(key, 0)
        for key in ('language_quality', 'cultural_relevance', 'visual_grounding')
    ]
    _bar_panel(axes[1, 1], summary_labels, summary_values, 'gold', 'Overall Performance Comparison')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        logger.info(f"Evaluation plots saved to {save_path}")

    plt.show()
    return fig
core/post_hoc_explainer.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import cv2
4
+ import numpy as np
5
+ from PIL import Image
6
+ import matplotlib.pyplot as plt
7
+ from transformers import CLIPProcessor, CLIPModel
8
+ import logging
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ class PostHocExplainer:
13
+ """
14
+ Post-hoc explanation module for generating visual explanations
15
+ Implements heatmaps to show which image regions influenced the answer
16
+ """
17
+
18
def __init__(self, clip_model, clip_processor=None, device='cuda'):
    """Store the CLIP model, processor and device, and put the model in eval mode.

    Args:
        clip_model: CLIP model used to produce explanations; must not be None.
        clip_processor: Matching processor. When None a warning is logged.
        device: Device identifier stored for later use.

    Raises:
        ValueError: If ``clip_model`` is None.
    """
    self.clip_model, self.clip_processor, self.device = clip_model, clip_processor, device

    if clip_model is None:
        raise ValueError("CLIP model cannot be None")

    if clip_processor is None:
        logger.warning("CLIP processor is None, some methods may not work")

    # Explanations are computed in inference mode.
    clip_model.eval()

    logger.info("PostHocExplainer initialized with CLIP model")
34
+
35
def generate_heatmap(self, image, question_text=None, method='attention_rollout'):
    """Produce a heatmap of important image regions using the chosen method.

    Args:
        image: Image to explain.
        question_text: Optional question passed on to the generator.
        method: 'attention_rollout', 'gradient_based' or 'occlusion'. Any
            other value logs a warning and uses attention rollout.

    Returns:
        The generator's heatmap, or the center fallback heatmap when the
        generator raises.
    """
    logger.info(f"Generating heatmap using method: {method}")

    try:
        available = (
            ('attention_rollout', self.generate_attention_rollout_heatmap),
            ('gradient_based', self.generate_gradient_heatmap),
            ('occlusion', self.generate_occlusion_heatmap),
        )
        for method_name, generator in available:
            if method == method_name:
                return generator(image, question_text)

        logger.warning(f"Unknown method {method}, using attention_rollout")
        return self.generate_attention_rollout_heatmap(image, question_text)

    except Exception as e:
        logger.error(f"Heatmap generation failed: {e}")
        logger.info("Using fallback center-focused heatmap")
        return self.create_center_fallback_heatmap()
54
+
55
def generate_attention_rollout_heatmap(self, image, question_text=None):
    """Build a 224x224 heatmap from the vision attention weights.

    The model is run with ``output_attentions=True``; the last attention
    layer is averaged over heads, the first row and column are dropped, and
    the remaining per-patch values are upsampled and min-max normalised.

    Args:
        image: Image passed to the CLIP processor.
        question_text: Text paired with the image; a generic question is
            used when None.

    Returns:
        2-D numpy array scaled to [0, 1]. On any failure the result of
        ``generate_gradient_heatmap`` is returned instead.
    """
    logger.info("Generating attention rollout heatmap")

    try:
        # Check if processor is available
        if self.clip_processor is None:
            raise ValueError("CLIP processor is required for attention rollout")

        # Prepare inputs
        if question_text is None:
            question_text = "What is in this image?"

        # Process image and text with truncation
        inputs = self.clip_processor(
            text=[question_text],
            images=image,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77  # CLIP's maximum token length
        ).to(self.device)

        logger.info("Running forward pass with attention outputs")

        # Get attention weights
        with torch.no_grad():
            outputs = self.clip_model(**inputs, output_attentions=True)

        # Try different ways to access vision attention
        vision_attentions = None

        # Method 1: Direct access
        if hasattr(outputs, 'vision_model_output') and outputs.vision_model_output is not None:
            if hasattr(outputs.vision_model_output, 'attentions'):
                vision_attentions = outputs.vision_model_output.attentions
                logger.info("Found vision attentions via vision_model_output")

        # Method 2: Check if attentions are in main output
        if vision_attentions is None and hasattr(outputs, 'attentions'):
            vision_attentions = outputs.attentions
            logger.info("Found attentions in main output")

        # If still no attention, create fallback
        if vision_attentions is None or len(vision_attentions) == 0:
            logger.warning("No attention weights found, creating uniform attention")
            # A constant map; the min-max normalisation below turns it into zeros.
            attention_2d = torch.ones(7, 7) / 49
        else:
            # Extract attention from last layer
            # NOTE(review): only the final layer is used; no layer-by-layer
            # rollout is performed despite the method name.
            last_attention = vision_attentions[-1]  # Last layer

            # Average across heads, then take the first batch item
            attention_map = last_attention.mean(dim=1)[0]  # [seq_len, seq_len]

            # Get spatial attention (excluding CLS token)
            # assumes index 0 is the CLS token - TODO confirm for the model in use
            spatial_attention = attention_map[1:, 1:]  # Remove CLS token

            # Reshape to spatial dimensions
            # patch_size here is the patch-grid side length, not pixels per patch.
            patch_size = int(np.sqrt(spatial_attention.shape[0]))
            if spatial_attention.shape[0] == patch_size * patch_size:
                # NOTE(review): mean(dim=1) averages each query row over the
                # key patches; if rows are softmax-normalised this may be
                # nearly uniform - confirm whether mean(dim=0) was intended.
                attention_2d = spatial_attention.mean(dim=1).reshape(patch_size, patch_size)
                logger.info(f"Reshaped attention to {patch_size}x{patch_size}")
            else:
                logger.warning(f"Cannot reshape attention {spatial_attention.shape}, using uniform")
                attention_2d = torch.ones(7, 7) / 49

        # Resize to 224x224
        attention_2d = F.interpolate(
            attention_2d.unsqueeze(0).unsqueeze(0),
            size=(224, 224),
            mode='bilinear',
            align_corners=False
        ).squeeze().cpu().numpy()

        # Normalize to [0, 1]; the epsilon avoids division by zero on flat maps
        attention_2d = (attention_2d - attention_2d.min()) / (attention_2d.max() - attention_2d.min() + 1e-8)

        logger.info(f"Generated attention heatmap with shape {attention_2d.shape}")
        return attention_2d

    except Exception as e:
        # Best effort: fall through to the next explanation method.
        logger.warning(f"Attention rollout failed: {e}, using gradient method")
        return self.generate_gradient_heatmap(image, question_text)
138
+
139
def generate_gradient_heatmap(self, image, question_text=None):
    """Build a saliency heatmap from the gradient of the image-text score.

    The image-text similarity logit is differentiated with respect to the
    pixel values; the per-pixel gradient norm over channels is min-max
    normalised to [0, 1].

    Args:
        image: Image passed to the CLIP processor.
        question_text: Text paired with the image; a generic question is
            used when None.

    Returns:
        2-D numpy array with the spatial size of the processed image. On any
        failure the result of ``generate_occlusion_heatmap`` is returned.
    """
    logger.info("Generating gradient-based heatmap")

    try:
        if self.clip_processor is None:
            raise ValueError("CLIP processor is required for gradient method")

        if question_text is None:
            question_text = "What is in this image?"

        # Input gradients do not need training mode. Staying in eval mode
        # means a failure below cannot leave the model in train mode.
        self.clip_model.eval()

        inputs = self.clip_processor(
            text=[question_text],
            images=image,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77  # CLIP's maximum token length
        ).to(self.device)

        pixel_values = inputs['pixel_values']
        pixel_values.requires_grad_(True)

        logger.info("Running forward pass for gradients")

        # enable_grad keeps this working inside a caller's no_grad block.
        with torch.enable_grad():
            outputs = self.clip_model(**inputs)

            # Similarity between the first image and the first text.
            score = outputs.logits_per_image[0, 0]

            logger.info("Computing gradients")

            # autograd.grad returns only the input gradient, so nothing is
            # accumulated into the model parameters' .grad buffers.
            (gradients,) = torch.autograd.grad(score, pixel_values)

        # Collapse channels: [C, H, W] -> [H, W].
        heatmap = torch.norm(gradients[0], dim=0).cpu().numpy()

        # The epsilon avoids division by zero on a flat map.
        heatmap = (heatmap - heatmap.min()) / (heatmap.max() - heatmap.min() + 1e-8)

        logger.info(f"Generated gradient heatmap with shape {heatmap.shape}")
        return heatmap

    except Exception as e:
        # Best effort: fall through to the next explanation method.
        logger.warning(f"Gradient method failed: {e}, using occlusion method")
        return self.generate_occlusion_heatmap(image, question_text)
197
+
198
def generate_occlusion_heatmap(self, image, question_text=None, patch_size=32):
    """Estimate region importance by greying out patches and re-scoring with CLIP.

    The image is resized to 224x224, scored once unmodified, then scored again
    for every ``patch_size`` x ``patch_size`` tile replaced by grey (128). The
    score drop for a tile is written into that tile of the heatmap.

    Args:
        image: PIL image or numpy array.
        question_text: Text paired with the image. Defaults to
            "What is in this image?" when ``None``.
        patch_size: Side length, in pixels, of each occluded tile.

    Returns:
        A 224x224 numpy array. Negative drops are clipped to 0 and the map is
        divided by its maximum when that maximum is positive. On failure the
        result of ``self.create_center_fallback_heatmap`` is returned.
    """
    logger.info("Generating occlusion-based heatmap")

    def clip_score(pil_image):
        # Image-text logit of one image against the current question text.
        encoded = self.clip_processor(
            text=[question_text],
            images=pil_image,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77  # CLIP's maximum token length
        ).to(self.device)

        with torch.no_grad():
            scored = self.clip_model(**encoded)
            return scored.logits_per_image[0, 0].cpu().item()

    try:
        if self.clip_processor is None:
            raise ValueError("CLIP processor is required for occlusion method")

        if question_text is None:
            question_text = "What is in this image?"

        # Work on a numpy copy at the fixed 224x224 working resolution.
        image_np = np.array(image) if isinstance(image, Image.Image) else image
        image_resized = cv2.resize(image_np, (224, 224))
        image_pil = Image.fromarray(image_resized)

        logger.info("Getting baseline score")
        baseline_score = clip_score(image_pil)
        logger.info(f"Baseline score: {baseline_score}")

        heatmap = np.zeros((224, 224))

        num_patches = 224 // patch_size
        logger.info(f"Testing {num_patches}x{num_patches} patches")

        for y in range(0, 224, patch_size):
            y_end = min(y + patch_size, 224)
            for x in range(0, 224, patch_size):
                try:
                    x_end = min(x + patch_size, 224)

                    # Grey out just this tile on a fresh copy.
                    occluded_image = image_resized.copy()
                    occluded_image[y:y_end, x:x_end] = 128

                    occluded_score = clip_score(Image.fromarray(occluded_image))

                    # A larger drop from the baseline means a more important tile.
                    heatmap[y:y_end, x:x_end] = baseline_score - occluded_score

                except Exception as e:
                    # A failed tile keeps its initial 0 and the scan goes on.
                    logger.warning(f"Occlusion patch ({x},{y}) failed: {e}")
                    continue

        # Keep only positive importance, then scale by the peak if there is one.
        heatmap = np.maximum(heatmap, 0)
        peak = heatmap.max()
        if peak > 0:
            heatmap = heatmap / peak

        logger.info(f"Generated occlusion heatmap with shape {heatmap.shape}")
        return heatmap

    except Exception as e:
        logger.error(f"Occlusion method failed: {e}")
        return self.create_center_fallback_heatmap()
287
+
288
def create_center_fallback_heatmap(self):
    """Return a 224x224 heatmap that peaks at the centre and fades linearly.

    Each value is ``max(0, 1 - d / 112)``, where ``d`` is the Euclidean
    distance of the pixel from (112, 112); pixels 112 or more away are 0.

    Returns:
        A float numpy array of shape (224, 224) with values in [0, 1].
    """
    logger.info("Creating fallback center-focused heatmap")

    size = 224
    center = 112

    # Open grids broadcast to the full distance map without a Python-level
    # loop over all 224 * 224 pixels.
    ys, xs = np.ogrid[:size, :size]
    distance = np.sqrt((ys - center) ** 2 + (xs - center) ** 2)

    return np.maximum(0.0, 1 - distance / center)
301
+
302
def visualize_explanation(self, image, heatmap, title="VQA Explanation", save_path=None):
    """Render the image, the heatmap and their overlay side by side.

    Args:
        image: PIL image or numpy array; resized to the heatmap's
            width/height before plotting.
        heatmap: 2-D array to draw with the 'hot' colormap.
        title: Title of the overlay panel.
        save_path: When truthy, the figure is written there at 300 dpi.

    Returns:
        The resized image as float32 scaled by 1/255, or ``None`` if any
        step fails.
    """
    fig = None
    try:
        # Prepare original image
        image_np = np.array(image) if isinstance(image, Image.Image) else image

        # Resize image to match heatmap (cv2 expects (width, height)).
        image_resized = cv2.resize(image_np, (heatmap.shape[1], heatmap.shape[0]))
        image_resized = image_resized.astype(np.float32) / 255.0

        fig = plt.figure(figsize=(15, 5))

        # Original image
        plt.subplot(1, 3, 1)
        plt.imshow(image_resized)
        plt.title("Original Image")
        plt.axis('off')

        # Heatmap
        plt.subplot(1, 3, 2)
        plt.imshow(heatmap, cmap='hot', interpolation='bilinear')
        plt.title("Attention Heatmap")
        plt.axis('off')
        plt.colorbar()

        # Overlay
        plt.subplot(1, 3, 3)
        plt.imshow(image_resized)
        plt.imshow(heatmap, cmap='hot', alpha=0.6, interpolation='bilinear')
        plt.title(title)
        plt.axis('off')

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            logger.info(f"Visualization saved to {save_path}")

        return image_resized

    except Exception as e:
        logger.error(f"Visualization failed: {e}")
        return None

    finally:
        # Always release the figure, including when plotting or saving
        # raised; otherwise failed calls pile up open figures.
        if fig is not None:
            plt.close(fig)
351
+
352
+
353
class VietnameseExplanationGenerator:
    """Compose Vietnamese-language explanations for VQA answers.

    Explanations are assembled from fixed sentence templates filled with
    entries of the cultural knowledge base passed to the constructor.
    """

    def __init__(self, cultural_kb):
        """Store the knowledge base and define one sentence template per category."""
        self.cultural_kb = cultural_kb

        # Category name -> Vietnamese sentence template; 'general' is the
        # fallback for categories without their own entry.
        self.templates = {
            'food': "Trong ảnh có {object}, đây là {description}. {cultural_significance}",
            'clothing': "Trang phục {object} trong ảnh thể hiện {cultural_significance}",
            'architecture': "Kiến trúc {object} mang đặc trưng {description}",
            'activity': "Hoạt động {object} có ý nghĩa {cultural_significance}",
            'general': "Đối tượng {object} trong văn hóa Việt Nam {description}"
        }

    def generate_explanation(self, question, answer, cultural_objects, heatmap=None):
        """Join a base sentence, per-object cultural sentences and an attention note.

        Args:
            question: Original question; only used in the failure message.
            answer: Answer text quoted in the opening sentence.
            cultural_objects: Object names; names missing from
                ``cultural_kb['objects']`` are skipped.
            heatmap: Optional attention map summarised in a closing sentence.

        Returns:
            The sentences joined by single spaces, or a generic sentence
            mentioning ``question`` if anything raises.
        """
        try:
            sentences = [f"Câu trả lời '{answer}' được đưa ra dựa trên phân tích hình ảnh."]

            for name in cultural_objects:
                if name not in self.cultural_kb['objects']:
                    continue

                entry = self.cultural_kb['objects'][name]
                kind = entry.get('category', 'general')
                pattern = self.templates.get(kind, self.templates['general'])

                sentences.append(
                    pattern.format(
                        object=name,
                        description=entry.get('description', ''),
                        cultural_significance=entry.get('cultural_significance', '')
                    )
                )

            if heatmap is not None:
                sentences.append(self.generate_attention_explanation(heatmap))

            return " ".join(sentences)

        except Exception as e:
            logger.warning(f"Explanation generation failed: {e}")
            return f"Phân tích hình ảnh cho câu hỏi: {question}"

    def generate_attention_explanation(self, heatmap):
        """Describe the heatmap in one sentence chosen from its max and mean.

        A maximum above 0.8 selects the "highly focused" sentence; otherwise a
        mean above 0.5 selects the "spread over several regions" sentence;
        otherwise the "fairly even" sentence is used. Any error yields a
        generic sentence.
        """
        try:
            peak = np.max(heatmap)
            average = np.mean(heatmap)

            if peak > 0.8:
                return "Mô hình tập trung cao độ vào một vùng cụ thể trong ảnh."
            if average > 0.5:
                return "Mô hình phân tán sự chú ý trên nhiều vùng khác nhau."
            return "Mô hình có sự chú ý tương đối đều trên toàn bộ ảnh."

        except Exception as e:
            logger.warning(f"Attention explanation failed: {e}")
            return "Phân tích sự chú ý của mô hình."
core/viet_meagent.py ADDED
@@ -0,0 +1,964 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import json
4
+ import cv2
5
+ import numpy as np
6
+ from PIL import Image
7
+ import google.generativeai as genai
8
+ from typing import Dict, List, Tuple, Optional
9
+ import logging
10
+ from transformers import CLIPProcessor, CLIPModel
11
+ import easyocr
12
+ from sentence_transformers import SentenceTransformer
13
+ import faiss
14
+ import os
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ class VietMEAgent:
19
+ """
20
+ VietMEAgent: Culturally-Aware Few-Shot Multimodal Explanation
21
+ for Vietnamese Visual Question Answering - FIXED CULTURAL DETECTION
22
+ """
23
+
24
def __init__(self, config_path: str = "configs/vietmeagent_config.json"):
    """Load the configuration, pick a device and build every component.

    Args:
        config_path: Path of the JSON configuration file handed to
            ``load_config``.
    """
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.config = self.load_config(config_path)
    self.device = torch.device(device_name)

    # Component setup, in dependency order: models first, then the knowledge
    # base (which needs the sentence encoder), then the few-shot examples.
    self.setup_models()
    self.load_cultural_knowledge()
    self.setup_few_shot_examples()

    logger.info(f"VietMEAgent initialized on {self.device}")
34
+
35
def load_config(self, config_path: str) -> Dict:
    """Load the VietMEAgent configuration as a flat dictionary.

    When the file exists, the keys of its ``model_config`` section are lifted
    to the top level and every other top-level entry is copied as-is (so a
    top-level entry wins over a same-named ``model_config`` key). The
    environment variables ``GEMINI_API_KEY``, ``CULTURAL_THRESHOLD`` and
    ``MAX_FEW_SHOT_EXAMPLES`` then override the matching keys when set.

    When the file does not exist, a default configuration is built from
    environment variables and built-in values. No API key is embedded in the
    code: without ``GEMINI_API_KEY`` the ``gemini_api_key`` entry is ``None``.

    Args:
        config_path: Path of the JSON configuration file.

    Returns:
        The flattened configuration dictionary.

    Raises:
        ValueError: If a numeric environment override cannot be converted.
    """
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        # Default config if file not found - environment variables first.
        # Secrets must come from the environment; never ship a key in source.
        return {
            "gemini_api_key": os.getenv('GEMINI_API_KEY'),
            "max_few_shot_examples": int(os.getenv('MAX_FEW_SHOT_EXAMPLES', 16)),
            "cultural_threshold": float(os.getenv('CULTURAL_THRESHOLD', 0.15)),
            "explanation_max_length": 200,
            "heatmap_resolution": (224, 224),
            "paths": {
                "cultural_kb": os.getenv('CULTURAL_KB_PATH', "data/cultural_kb/vietnamese_cultural_knowledge.json"),
                "vqa_dataset": os.getenv('VQA_DATASET_PATH', "data/annotations/vietnamese_vqa_dataset.json"),
                "output_dir": os.getenv('OUTPUT_DIR', "results")
            }
        }

    # Flatten nested config for backward compatibility: model_config keys
    # first, then every other section on top.
    flat_config = {}
    if 'model_config' in config:
        flat_config.update(config['model_config'])
    flat_config.update(
        {section: values for section, values in config.items() if section != 'model_config'}
    )

    # Override with environment variables if available
    if os.getenv('GEMINI_API_KEY'):
        flat_config['gemini_api_key'] = os.getenv('GEMINI_API_KEY')

    if os.getenv('CULTURAL_THRESHOLD'):
        flat_config['cultural_threshold'] = float(os.getenv('CULTURAL_THRESHOLD'))

    if os.getenv('MAX_FEW_SHOT_EXAMPLES'):
        flat_config['max_few_shot_examples'] = int(os.getenv('MAX_FEW_SHOT_EXAMPLES'))

    return flat_config
82
+
83
def setup_models(self):
    """Create every model object the agent uses and attach it to ``self``.

    Sets ``llm_model``, ``clip_processor``, ``clip_model``, ``ocr_reader``,
    ``sentence_encoder`` and ``cultural_detector``. Reads
    ``self.config["gemini_api_key"]`` and ``self.device``.
    """
    logger.info("Setting up models...")

    # 1. Gemini for LLM reasoning
    api_key = self.config["gemini_api_key"]
    genai.configure(api_key=api_key)
    self.llm_model = genai.GenerativeModel('gemini-1.5-flash')

    # 2. CLIP for vision-language understanding
    logger.info("Loading CLIP model...")
    self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    self.clip_model = clip.to(self.device)

    # 3. OCR reader for Vietnamese and English, on GPU when one is available
    logger.info("Setting up Vietnamese OCR...")
    gpu_available = torch.cuda.is_available()
    self.ocr_reader = easyocr.Reader(['vi', 'en'], gpu=gpu_available)

    # 4. Sentence encoder for cultural similarity
    logger.info("Loading sentence encoder...")
    self.sentence_encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    # 5. Cultural object detector, sharing the CLIP model loaded above
    logger.info("Setting up cultural object detector...")
    self.cultural_detector = CulturalObjectDetector(
        self.clip_model,
        self.clip_processor,
        self.device,
    )

    logger.info("All models loaded successfully!")
111
+
112
def load_cultural_knowledge(self):
    """Read the cultural knowledge base JSON and index it for retrieval.

    The file path comes from ``self.config["paths"]["cultural_kb"]``; the
    parsed content is stored in ``self.cultural_kb`` and then embedded via
    ``create_cultural_embeddings``.
    """
    with open(self.config["paths"]["cultural_kb"], 'r', encoding='utf-8') as kb_file:
        self.cultural_kb = json.load(kb_file)

    # Build the embedding index used for similarity lookups.
    self.create_cultural_embeddings()

    object_count = len(self.cultural_kb['objects'])
    logger.info(f"Cultural KB loaded with {object_count} objects")
121
+
122
def create_cultural_embeddings(self):
    """Embed every knowledge-base object and store the vectors in a FAISS index.

    Each object is represented by its name, description and cultural
    significance joined by spaces. ``self.cultural_objects`` keeps the object
    names in the same order as the rows added to ``self.cultural_index``.
    """
    self.cultural_objects = names = []
    texts = []

    for name, info in self.cultural_kb['objects'].items():
        texts.append(f"{name} {info['description']} {info['cultural_significance']}")
        names.append(name)

    vectors = self.sentence_encoder.encode(texts)

    # Inner-product index sized to the embedding dimension.
    dimension = vectors.shape[1]
    self.cultural_index = faiss.IndexFlatIP(dimension)
    self.cultural_index.add(vectors.astype('float32'))

    logger.info("Cultural embeddings created")
140
+
141
def setup_few_shot_examples(self):
    """Load the VQA dataset and keep a category-diverse subset as few-shot examples.

    The dataset path is ``self.config["paths"]["vqa_dataset"]`` and the subset
    size is ``self.config["max_few_shot_examples"]``.
    """
    with open(self.config["paths"]["vqa_dataset"], 'r', encoding='utf-8') as dataset_file:
        dataset = json.load(dataset_file)

    # Spread the selection across categories.
    limit = self.config["max_few_shot_examples"]
    self.few_shot_examples = self.select_diverse_examples(dataset, k=limit)

    logger.info(f"Selected {len(self.few_shot_examples)} few-shot examples")
152
+
153
def select_diverse_examples(self, vqa_data: List[Dict], k: int = 16) -> List[Dict]:
    """Pick up to ``k`` examples spread across the categories in ``vqa_data``.

    Items are grouped by their ``'category'`` key (``'unknown'`` when absent).
    From every category the ``max(1, k // number_of_categories)`` items with
    the most ``'questions'`` are taken, in first-seen category order, and the
    combined list is cut to ``k``. Fewer than ``k`` items may be returned when
    categories are small.

    Args:
        vqa_data: Dataset items (dictionaries).
        k: Maximum number of examples to return.

    Returns:
        The selected items; an empty list when ``vqa_data`` is empty or
        ``k`` is not positive.
    """
    # An empty dataset would otherwise divide by zero below.
    if not vqa_data or k <= 0:
        return []

    examples_by_category = {}
    for item in vqa_data:
        examples_by_category.setdefault(item.get('category', 'unknown'), []).append(item)

    examples_per_category = max(1, k // len(examples_by_category))

    selected_examples = []
    for examples in examples_by_category.values():
        # Quality proxy: items with more questions come first (stable order).
        ranked = sorted(examples, key=lambda x: len(x.get('questions', [])), reverse=True)
        selected_examples.extend(ranked[:examples_per_category])

    return selected_examples[:k]
173
+
174
def process_image(self, image_path: str) -> Dict:
    """Run OCR, cultural object detection and context retrieval on one image.

    Args:
        image_path: Either a file path (``str``) or an array. Arrays are
            multiplied by 255 and cast to uint8, so they are presumably
            expected to hold floats in [0, 1] — TODO confirm with callers.

    Returns:
        A dictionary with the keys ``image_path``, ``vietnamese_text``,
        ``cultural_objects``, ``cultural_context`` and
        ``processed_successfully``.
    """
    logger.info(f"Processing image: {image_path}")

    given_as_path = isinstance(image_path, str)
    if given_as_path:
        image = Image.open(image_path).convert('RGB')
    else:
        # Handle numpy array input
        as_bytes = (image_path * 255).astype(np.uint8)
        image = Image.fromarray(as_bytes).convert('RGB')

    # 1. Text visible in the image
    vietnamese_text = self.extract_vietnamese_text(image)

    # 2. Cultural objects
    cultural_objects = self.cultural_detector.detect_objects(image)
    logger.info(f"Detected cultural objects: {cultural_objects}")

    # 3. Knowledge-base context for everything that was found
    cultural_context = self.retrieve_cultural_context(cultural_objects + vietnamese_text)

    # 4. Bundle the analysis
    return {
        "image_path": image_path if given_as_path else "processed_array",
        "vietnamese_text": vietnamese_text,
        "cultural_objects": cultural_objects,
        "cultural_context": cultural_context,
        "processed_successfully": True
    }
205
+
206
def extract_vietnamese_text(self, image: Image.Image) -> List[str]:
    """Return the text fragments OCR finds in ``image`` with confidence above 0.5.

    Args:
        image: RGB PIL image.

    Returns:
        The recognised strings in reader order, or an empty list if OCR fails.
    """
    try:
        # The reader is given the image in OpenCV's BGR channel order.
        bgr_pixels = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        detections = self.ocr_reader.readtext(bgr_pixels)

        # Each detection is (bbox, text, confidence); drop low-confidence ones.
        return [text for (_bbox, text, confidence) in detections if confidence > 0.5]

    except Exception as e:
        logger.warning(f"OCR extraction failed: {e}")
        return []
226
+
227
def retrieve_cultural_context(self, detected_items: List[str]) -> Dict:
    """Look up knowledge-base entries similar to the detected items.

    The items are joined into one query string, embedded, and matched against
    ``self.cultural_index``. Up to five nearest objects are considered and
    those scoring above ``self.config["cultural_threshold"]`` are kept.

    Args:
        detected_items: Object names and/or text fragments.

    Returns:
        Mapping of object name to its knowledge-base entry; empty when
        ``detected_items`` is empty or nothing passes the threshold.
    """
    if not detected_items:
        return {}

    # One combined query for all detected items.
    query_vector = self.sentence_encoder.encode([" ".join(detected_items)])

    top_k = min(5, len(self.cultural_objects))
    scores, indices = self.cultural_index.search(query_vector.astype('float32'), top_k)

    matches = {}
    for score, position in zip(scores[0], indices[0]):
        if not score > self.config["cultural_threshold"]:
            continue
        name = self.cultural_objects[position]
        matches[name] = self.cultural_kb['objects'][name]

    return matches
248
+
249
def generate_vietnamese_vqa(self, image_path: str, question: str = None) -> Dict:
    """Ask the LLM for Vietnamese question/answer pairs about an image.

    Args:
        image_path: File path or array, as accepted by ``process_image``.
        question: Optional specific question to include in the prompt.

    Returns:
        The parsed LLM response extended with ``image_analysis``,
        ``cultural_awareness`` and ``processing_success``; or, when the
        generation/parsing step raises, ``{"error": ..., "processing_success":
        False}``.
    """
    logger.info(f"Generating VQA for: {image_path}")

    # Local analysis (OCR, objects, cultural context) feeds the prompt.
    image_analysis = self.process_image(image_path)

    # Image object handed to the LLM alongside the prompt.
    if not isinstance(image_path, str):
        as_bytes = (image_path * 255).astype(np.uint8)
        image = Image.fromarray(as_bytes).convert('RGB')
    else:
        image = Image.open(image_path)

    prompt = self.create_cultural_prompt(image_analysis, question)

    try:
        response = self.llm_model.generate_content([prompt, image])
        vqa_result = self.parse_vqa_response(response.text)

        # Attach metadata about this run.
        vqa_result.update(
            image_analysis=image_analysis,
            cultural_awareness=True,
            processing_success=True,
        )
        return vqa_result

    except Exception as e:
        logger.error(f"VQA generation failed: {e}")
        return {"error": str(e), "processing_success": False}
284
+
285
def create_cultural_prompt(self, image_analysis: Dict, question: str = None) -> str:
    """Build the Vietnamese prompt sent to the LLM for VQA generation.

    The prompt lists the OCR text and cultural objects from ``image_analysis``,
    adds one line per ``cultural_context`` entry with its
    ``cultural_significance``, states the requirements, optionally appends the
    specific ``question``, and ends with the expected JSON layout.

    Args:
        image_analysis: Analysis dictionary; the keys ``vietnamese_text``,
            ``cultural_objects`` and ``cultural_context`` are read, each with
            an empty default.
        question: Optional question; appended only when truthy.

    Returns:
        The complete prompt text.
    """
    # NOTE(review): the leading whitespace inside the multi-line literals
    # below could not be recovered from the diff view; confirm against the
    # repository copy before relying on the exact prompt layout.

    # Header: role instruction plus the locally extracted text and objects.
    prompt = f"""
Bạn là chuyên gia về văn hóa Việt Nam. Hãy phân tích hình ảnh này và tạo câu hỏi-trả lời bằng tiếng Việt.

THÔNG TIN PHÂN TÍCH:
- Text trong ảnh: {', '.join(image_analysis.get('vietnamese_text', []))}
- Đối tượng văn hóa: {', '.join(image_analysis.get('cultural_objects', []))}

BỐI CẢNH VĂN HÓA:
"""

    # Add cultural context: one bullet per retrieved knowledge-base object.
    for obj_name, obj_data in image_analysis.get('cultural_context', {}).items():
        prompt += f"- {obj_name}: {obj_data.get('cultural_significance', '')}\n"

    # Requirements section (number of questions, accuracy, explanation scope).
    prompt += f"""

YÊU CẦU:
1. Tạo 2-3 câu hỏi về văn hóa Việt Nam (nếu không có câu hỏi cụ thể)
2. Câu trả lời phải chính xác và có giải thích văn hóa
3. Giải thích phải bao gồm ý nghĩa, nguồn gốc, cách sử dụng

"""

    # The caller's own question, when one was given.
    if question:
        prompt += f"CÂU HỎI CỤ THỂ: {question}\n"

    # Expected response layout; parse_vqa_response looks for this JSON object.
    prompt += """
FORMAT JSON:
{
"questions": [
{
"question": "Câu hỏi",
"answer": "Câu trả lời",
"explanation": "Giải thích có bối cảnh văn hóa",
"cultural_objects": ["đối tượng 1", "đối tượng 2"],
"confidence": 0.9
}
]
}
"""

    return prompt
330
+
331
def parse_vqa_response(self, response_text: str) -> Dict:
    """Extract the JSON object embedded in an LLM response.

    The span from the first ``{`` to the last ``}`` is parsed as JSON. When
    there is no such span, or it is not valid JSON, the text is handed to
    ``fallback_parse_response`` instead.

    Args:
        response_text: Raw text returned by the LLM.

    Returns:
        The decoded JSON object, or the fallback parser's result.
    """
    start_idx = response_text.find('{')
    end_idx = response_text.rfind('}')

    # Compare the raw find() results: checking after adding 1 to rfind()
    # can never detect the "not found" sentinel. A closing brace located
    # before the opening one also means there is no usable span.
    if start_idx == -1 or end_idx < start_idx:
        return self.fallback_parse_response(response_text)

    try:
        return json.loads(response_text[start_idx:end_idx + 1])
    except json.JSONDecodeError:
        return self.fallback_parse_response(response_text)
347
+
348
def fallback_parse_response(self, text: str) -> Dict:
    """Recover question/answer/explanation triples from a non-JSON response.

    The text is scanned line by line. A line is assigned to the first field
    whose English or Vietnamese keyword it contains (question, then answer,
    then explanation); if the line has a colon, the text after the first colon
    becomes that field's value. As soon as all three fields are filled the
    entry is stored with a confidence of 0.7 and a new entry is started.
    An incomplete trailing entry is discarded.

    Args:
        text: Raw response text.

    Returns:
        ``{"questions": [...]}`` with one dictionary per completed entry.
    """
    # Field name -> keywords, in the priority order used for matching.
    field_keywords = (
        ("question", ('question', 'câu hỏi')),
        ("answer", ('answer', 'trả lời')),
        ("explanation", ('explanation', 'giải thích')),
    )

    def empty_entry():
        return {"question": "", "answer": "", "explanation": "", "cultural_objects": []}

    parsed = {"questions": []}
    pending = empty_entry()

    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        lowered = stripped.lower()

        for field, keywords in field_keywords:
            if lowered.find(keywords[0]) != -1 or lowered.find(keywords[1]) != -1:
                # Only lines of the form "label: value" carry a value.
                if ':' in stripped:
                    pending[field] = stripped.split(':', 1)[1].strip()
                break

        # If we have all required fields, add to results
        if pending["question"] and pending["answer"] and pending["explanation"]:
            pending["confidence"] = 0.7  # Default confidence for fallback
            parsed["questions"].append(pending)
            pending = empty_entry()

    return parsed
374
+
375
def save_results(self, results: List[Dict], output_path: str):
    """Write ``results`` to ``output_path`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped.

    Args:
        results: Result dictionaries to serialise.
        output_path: Destination file; overwritten if it exists.
    """
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(results, out_file, ensure_ascii=False, indent=2)

    logger.info(f"Results saved to {output_path}")
381
+
382
+
383
+ class CulturalObjectDetector:
384
+ """Detect Vietnamese cultural objects using CLIP - FIXED VERSION"""
385
+
386
def __init__(self, clip_model, clip_processor, device):
    """Keep references to the shared CLIP components and load the vocabulary.

    Args:
        clip_model: CLIP model instance.
        clip_processor: Processor paired with ``clip_model``.
        device: Device the model lives on.
    """
    self.device = device
    self.clip_processor = clip_processor
    self.clip_model = clip_model

    # Vocabulary of cultural objects, built by load_cultural_vocabulary().
    self.cultural_vocabulary = self.load_cultural_vocabulary()

    vocabulary_size = len(self.cultural_vocabulary)
    logger.info(f"Cultural detector initialized with {vocabulary_size} objects")
394
+
395
+ def load_cultural_vocabulary(self) -> List[str]:
396
+ """Load vocabulary of Vietnamese cultural objects - COMPREHENSIVE FROM CRAWL DATA"""
397
+ # English-Vietnamese pairs based on crawl_summary.json (12 categories, 507 keywords)
398
+ vocabulary_pairs = [
399
+ # ===== 1. ÂM THỰC (FOOD) =====
400
+ ("vietnamese pho soup", "phở"),
401
+ ("vietnamese banh mi sandwich", "bánh mì"),
402
+ ("vietnamese spring rolls", "gỏi cuốn"),
403
+ ("vietnamese pancake", "bánh xèo"),
404
+ ("sticky rice", "xôi"),
405
+ ("vietnamese coffee", "cà phê"),
406
+ ("vietnamese tea", "chè"),
407
+ ("rice paper", "bánh tráng"),
408
+ ("fish sauce", "nước mắm"),
409
+ ("hue beef noodle soup", "bún bò Huế"),
410
+ ("vietnamese sticky rice cake", "bánh chưng"),
411
+ ("broken rice", "cơm tấm"),
412
+ ("cao lau noodles", "cao lầu"),
413
+ ("mi quang noodles", "mì Quảng"),
414
+ ("hanoi grilled pork noodles", "bún chả"),
415
+ ("steamed rice rolls", "bánh cuốn"),
416
+ ("cha ca fish", "chả cá"),
417
+ ("grilled pork skewers", "nem nướng"),
418
+ ("vietnamese steamed buns", "bánh bao"),
419
+ ("red sticky rice", "xôi gấc"),
420
+ ("vietnamese flan", "bánh flan"),
421
+ ("grilled rice paper", "bánh tráng nướng"),
422
+ ("vietnamese filter coffee", "cà phê phin"),
423
+ ("phan thiet pancakes", "bánh căn"),
424
+ ("grilled pork vermicelli", "bún thịt nướng"),
425
+ ("mini pancakes", "bánh khọt"),
426
+ ("pork offal porridge", "cháo lòng"),
427
+ ("tapioca dumplings", "bánh bột lọc"),
428
+ ("small dumplings", "bánh ít"),
429
+ ("cylindrical sticky rice cake", "bánh tét"),
430
+ ("pounded rice cake", "bánh chày"),
431
+ ("hue imperial rice", "cơm âm phủ"),
432
+ ("fermented shrimp paste", "mắm ruốc"),
433
+ ("phu quoc fish sauce", "nước mắm Phú Quốc"),
434
+ ("chili sauce", "tương ớt"),
435
+ ("mung bean cake", "bánh đậu xanh"),
436
+ ("durian cake", "bánh pía"),
437
+ ("ben tre coconut candy", "kẹo dừa Bến Tre"),
438
+ ("tet jam", "mứt Tết"),
439
+ ("molded cake", "bánh in"),
440
+ ("pyramid dumpling", "bánh giò"),
441
+ ("black sticky rice cake", "bánh gai"),
442
+ ("fried doughnut", "bánh rán"),
443
+ ("hung yen cinnamon sausage", "chả quế Hưng Yên"),
444
+ ("fermented pork roll", "nem chua"),
445
+ ("dried shrimp", "tôm khô"),
446
+ ("shrimp paste", "mắm tôm"),
447
+ ("fish porridge", "cháo cá"),
448
+ ("sour soup", "canh chua"),
449
+ ("grilled chicken", "gà nướng"),
450
+ ("roasted duck", "vịt quay"),
451
+ ("vietnamese ham", "chả lụa"),
452
+ ("pork head cheese", "giò thủ"),
453
+ ("special sticky rice cake", "bánh chưng gù"),
454
+ ("rice cake", "bánh dày"),
455
+
456
+ # ===== 2. KIẾN TRÚC (ARCHITECTURE) =====
457
+ ("vietnamese temple", "chùa"),
458
+ ("vietnamese pagoda", "chùa"),
459
+ ("village communal house", "đình làng"),
460
+ ("stilt house", "nhà sàn"),
461
+ ("hanoi flag tower", "cột cờ Hà Nội"),
462
+ ("one pillar pagoda", "chùa Một Cột"),
463
+ ("tran quoc pagoda", "chùa Trấn Quốc"),
464
+ ("temple of literature", "Văn Miếu"),
465
+ ("ho chi minh mausoleum", "lăng Hồ Chí Minh"),
466
+ ("dragon house", "nhà rồng"),
467
+ ("ba den temple", "chùa Bà Đen"),
468
+ ("ngoc son temple", "đền Ngọc Sơn"),
469
+ ("hanoi old quarter", "phố cổ Hà Nội"),
470
+ ("hue imperial architecture", "kiến trúc Huế"),
471
+ ("an dinh palace", "cung An Định"),
472
+ ("independence palace", "dinh Độc Lập"),
473
+ ("dong xuan market", "chợ Đồng Xuân"),
474
+ ("japanese covered bridge", "cầu Nhật Bản"),
475
+ ("hoi an ancient house", "nhà cổ Hội An"),
476
+ ("terraced fields architecture", "ruộng bậc thang"),
477
+ ("notre dame cathedral", "nhà thờ Đức Bà"),
478
+ ("saigon post office", "bưu điện Sài Gòn"),
479
+ ("hanoi opera house", "nhà hát Lớn Hà Nội"),
480
+ ("long bien bridge", "cầu Long Biên"),
481
+ ("thang long imperial citadel", "hoàng thành Thăng Long"),
482
+ ("hue imperial city", "kinh thành Huế"),
483
+ ("khai dinh tomb", "lăng Khải Định"),
484
+ ("minh mang tomb", "lăng Minh Mạng"),
485
+ ("bai dinh pagoda", "chùa Bái Đính"),
486
+ ("tam chuc pagoda", "chùa Tam Chúc"),
487
+ ("hung kings temple", "đền Hùng"),
488
+ ("bach ma temple", "đền Bạch Mã"),
489
+ ("hanoi citadel gate", "cổng thành Hà Nội"),
490
+ ("turtle tower", "tháp Rùa"),
491
+ ("the huc bridge", "cầu Thê Húc"),
492
+ ("ho chi minh house", "nhà Bác Hồ"),
493
+ ("presidential palace", "phủ Chủ tịch"),
494
+ ("ba dinh square", "quảng trường Ba Đình"),
495
+ ("tu duc tomb", "lăng Tự Đức"),
496
+ ("jade emperor pagoda", "chùa Ngọc Hoàng"),
497
+ ("cao dai temple", "chùa Cao Đài"),
498
+ ("hmong stilt house", "nhà sàn H'Mông"),
499
+ ("ede longhouse", "nhà dài Ê Đê"),
500
+ ("mekong traditional house", "nhà truyền thống miền Tây"),
501
+ ("hue garden house", "nhà vườn Huế"),
502
+ ("french villa", "biệt thự Pháp"),
503
+ ("gothic architecture", "kiến trúc Gothic"),
504
+
505
+ # ===== 3. TRANG PHỤC (CLOTHING) =====
506
+ ("vietnamese traditional dress", "áo dài"),
507
+ ("conical hat", "nón lá"),
508
+ ("vietnamese traditional clothing", "trang phục truyền thống"),
509
+ ("ethnic costume", "trang phục dân tộc"),
510
+ ("vietnamese traditional shirt", "áo bà ba"),
511
+ ("thai headscarf", "khăn piêu"),
512
+ ("hmong traditional costume", "trang phục H'Mông"),
513
+ ("hue brocade dress", "áo gấm Huế"),
514
+ ("hue turban", "khăn đóng Huế"),
515
+ ("wooden shoes", "giày gỗ"),
516
+ ("four-panel dress", "áo tứ thân"),
517
+ ("traditional bra", "yếm đào"),
518
+ ("wedding ao dai", "áo dài cưới"),
519
+ ("chin strap hat", "nón quai thao"),
520
+ ("brocade fabric", "thổ cẩm"),
521
+ ("silk scarf", "khăn lụa"),
522
+ ("mens ao dai", "áo dài nam"),
523
+ ("childrens ao dai", "áo dài trẻ em"),
524
+ ("student ao dai", "áo dài học sinh"),
525
+ ("modern ao dai", "áo dài cách tân"),
526
+ ("hue conical hat", "nón lá Huế"),
527
+ ("poem hat", "nón bài thơ"),
528
+ ("southern checkered scarf", "khăn rằn Nam Bộ"),
529
+ ("traditional halter top", "áo yếm truyền thống"),
530
+ ("tay traditional costume", "trang phục Tày"),
531
+ ("nung traditional costume", "trang phục Nùng"),
532
+ ("muong traditional costume", "trang phục Mường"),
533
+ ("khmer traditional costume", "trang phục Khmer"),
534
+ ("cham traditional costume", "trang phục Chăm"),
535
+ ("cham sarong", "sarong Chăm"),
536
+ ("cham turban", "turban Chăm"),
537
+ ("ede traditional costume", "trang phục Ê Đê"),
538
+ ("co tu traditional costume", "trang phục Cơ Tu"),
539
+ ("dao traditional costume", "trang phục Dao"),
540
+ ("giay traditional costume", "trang phục Giáy"),
541
+ ("la chi traditional costume", "trang phục La Chí"),
542
+ ("brocade skirt", "váy thổ cẩm"),
543
+ ("brocade headscarf", "khăn thổ cẩm"),
544
+ ("brocade bag", "túi thổ cẩm"),
545
+ ("silver bracelet", "vòng tay bạc"),
546
+ ("silver necklace", "dây chuyền bạc"),
547
+ ("ethnic earrings", "khuyên tai dân tộc"),
548
+ ("hmong collar", "vòng cổ H'Mông"),
549
+ ("brocade belt", "thắt lưng thổ cẩm"),
550
+
551
+ # ===== 4. LỄ HỘI (FESTIVALS) =====
552
+ ("vietnamese new year", "Tết Nguyên Đán"),
553
+ ("cherry blossom festival", "lễ hội hoa anh đào"),
554
+ ("mid autumn festival", "Trung thu"),
555
+ ("hung kings festival", "lễ hội đền Hùng"),
556
+ ("hue festival", "festival Huế"),
557
+ ("perfume pagoda festival", "lễ hội chùa Hương"),
558
+ ("kate festival", "Kate festival"),
559
+ ("whale worship festival", "lễ hội cầu ngư"),
560
+ ("buffalo fighting festival", "lễ hội chọi trâu"),
561
+ ("sticky rice cake festival", "lễ hội bánh chưng"),
562
+ ("giong festival", "Gióng festival"),
563
+ ("village festival", "lễ hội làng"),
564
+ ("vietnamese wedding", "đám cưới Việt Nam"),
565
+ ("water festival", "lễ hội nước"),
566
+ ("harvest festival", "lễ hội harvest"),
567
+ ("vu lan festival", "Vu Lan festival"),
568
+ ("boat racing festival", "lễ hội đua thuyền"),
569
+ ("buffalo fighting festival", "lễ hội chọi trâu"),
570
+ ("rice harvest festival", "lễ hội hái lúa"),
571
+ ("thanksgiving festival", "lễ hội cúng ơn"),
572
+ ("ok om bok festival", "lễ hội Óc Om Bóc"),
573
+ ("don ta festival", "lễ hội Dôn Ta"),
574
+ ("khmer new year", "lễ hội Chaul Chnam Thmey"),
575
+ ("roong pooc festival", "lễ hội Roóng Poọc"),
576
+ ("nang hai festival", "lễ hội Nàng Hai"),
577
+ ("lion dance festival", "lễ hội múa lân"),
578
+ ("fireworks festival", "lễ hội pháo hoa"),
579
+ ("ban flower festival", "lễ hội hoa ban"),
580
+ ("coffee festival", "lễ hội café"),
581
+ ("con throwing festival", "lễ hội ném còn"),
582
+ ("love festival", "lễ hội tình yêu"),
583
+ ("vietnamese valentine", "Valentine Việt Nam"),
584
+ ("first full moon", "rằm tháng Giêng"),
585
+ ("cold food festival", "tết Hàn thực"),
586
+ ("doan ngo festival", "tết Đoan ngọ"),
587
+ ("seventh month full moon", "rằm tháng Bảy"),
588
+ ("mid autumn festival", "tết Trung thu"),
589
+ ("teachers day", "lễ 20/11"),
590
+ ("womens day", "lễ 8/3"),
591
+ ("hung kings commemoration", "lễ giỗ tổ Hùng Vương"),
592
+ ("national day", "lễ Quốc khánh"),
593
+
594
+ # ===== 5. THỦ CÔNG MỸ NGHỆ (HANDICRAFTS) =====
595
+ ("bat trang ceramics", "gốm sứ Bát Tràng"),
596
+ ("dong ho paintings", "tranh Đông Hồ"),
597
+ ("vietnamese embroidery", "thêu Việt Nam"),
598
+ ("weaving", "đan lát"),
599
+ ("bamboo weaving", "mây tre đan"),
600
+ ("vietnamese lacquer", "sơn mài Việt Nam"),
601
+ ("wood carving", "điêu khắc gỗ"),
602
+ ("hue ceramics", "gốm Huế"),
603
+ ("silk painting vietnam", "tranh lụa"),
604
+ ("bronze casting", "đúc đồng"),
605
+ ("stone carving", "chạm khắc"),
606
+ ("brocade weaving", "thổ cẩm dệt"),
607
+ ("chu dau ceramics", "gốm Chu Đậu"),
608
+ ("hue porcelain", "sứ Huế"),
609
+ ("phu lang ceramics", "gốm Phù Lãng"),
610
+ ("silk painting", "tranh lụa"),
611
+ ("lacquer painting", "tranh sơn mài"),
612
+ ("mother of pearl inlay", "khảm trai"),
613
+ ("wood sculpture", "tượng gỗ"),
614
+ ("stone sculpture", "tượng đá"),
615
+ ("bronze items", "đồ đồng"),
616
+ ("silver items", "đồ bạc"),
617
+ ("ethnic jewelry", "trang sức dân tộc"),
618
+ ("folk masks", "mặt nạ dân gian"),
619
+ ("water puppets", "rối nước"),
620
+ ("carpet weaving", "dệt thảm"),
621
+ ("sedge mat", "chiếu cói"),
622
+ ("handmade conical hat", "nón lá thủ công"),
623
+ ("incense making", "làm hương"),
624
+ ("do paper", "giấy dó"),
625
+ ("cake mold making", "làm bánh in"),
626
+ ("folk candy", "kẹo dân gian"),
627
+
628
+ # ===== 6. NHẠC CỤ (MUSICAL INSTRUMENTS) =====
629
+ ("vietnamese monochord", "đàn bầu"),
630
+ ("vietnamese drums", "trống"),
631
+ ("bamboo flute", "sáo trúc"),
632
+ ("vietnamese zither", "đàn tranh"),
633
+ ("moon lute", "đàn nguyệt"),
634
+ ("vietnamese pipa", "đàn tỳ bà"),
635
+ ("gourd trumpet", "kèn bầu"),
636
+ ("bronze gong", "cồng chiêng"),
637
+ ("two string fiddle", "đàn nhị"),
638
+ ("pan flute", "sáo điếu"),
639
+ ("rice drum", "trống cơm"),
640
+ ("vietnamese lute", "đàn đáy"),
641
+ ("vietnamese guitar", "đàn sến"),
642
+ ("36 string zither", "đàn tam thập lục"),
643
+ ("16 string zither", "đàn thập lục"),
644
+ ("leaf trumpet", "kèn lá"),
645
+ ("ethnic flute", "sáo mọi"),
646
+ ("ceremonial drum", "trống chầu"),
647
+ ("wooden bell", "mõ gỗ"),
648
+ ("temple bell", "chuông chùa"),
649
+ ("bronze cymbal", "chiêng đồng"),
650
+ ("kni string instrument", "đàn K'ni"),
651
+ ("trung bamboo xylophone", "đàn T'rưng"),
652
+ ("pi flute", "sáo pí"),
653
+ ("bronze drum", "trống đồng"),
654
+ ("single string instrument", "đàn bầu độc huyền"),
655
+
656
+ # ===== 7. PHONG CẢNH (LANDSCAPES) =====
657
+ ("ha long bay", "vịnh Hạ Long"),
658
+ ("sapa terraced fields", "ruộng bậc thang Sapa"),
659
+ ("mekong delta", "delta sông Mekong"),
660
+ ("hoan kiem lake", "Hồ Gươm"),
661
+ ("west lake hanoi", "Hồ Tây Hà Nội"),
662
+ ("phong nha cave", "Phong Nha cave"),
663
+ ("ba be lake", "Ba Be lake"),
664
+ ("mui ne sand dunes", "Mũi Né sand dunes"),
665
+ ("ninh binh landscape", "Ninh Bình landscape"),
666
+ ("tam coc", "Tam Cốc"),
667
+ ("hoi an ancient town", "Hội An ancient town"),
668
+ ("da lat hills", "Đà Lạt hills"),
669
+ ("can tho floating market", "Cần Thơ floating market"),
670
+ ("muong hoa valley", "Mường Hoa valley"),
671
+ ("ha long bay caves", "Hạ Long Bay caves"),
672
+ ("fansipan mountain", "núi Phan Xi Păng"),
673
+ ("dong van plateau", "cao nguyên Đồng Văn"),
674
+ ("cuc phuong national park", "vườn quốc gia Cúc Phương"),
675
+ ("u minh national park", "vườn quốc gia U Minh"),
676
+ ("phu quoc island", "đảo Phú Quốc"),
677
+ ("cat ba island", "đảo Cát Bà"),
678
+ ("thoi son islet", "cồn Thoi Son"),
679
+ ("moc chau tea hills", "đồi chè Mộc Châu"),
680
+ ("mui ne sand hills", "đồi cát Mũi Né"),
681
+ ("quy nhon beach", "biển Quy Nhon"),
682
+ ("nha trang beach", "biển Nha Trang"),
683
+ ("da nang beach", "biển Đà Nẵng"),
684
+ ("vung tau beach", "biển Vũng Tàu"),
685
+ ("ha long beach", "biển Hạ Long"),
686
+ ("sam son beach", "biển Sầm Sơn"),
687
+ ("red river", "sông Hồng"),
688
+ ("mekong river", "sông Mekong"),
689
+ ("perfume river", "sông Hương"),
690
+ ("thu bon river", "sông Thu Bồn"),
691
+ ("ban gioc waterfall", "thác Ban Giốc"),
692
+ ("can gio mangrove forest", "rừng ngập mặn Cần Giờ"),
693
+ ("tram chim forest", "rừng Tràm Chim"),
694
+ ("yok don national park", "vườn quốc gia Yok Đôn"),
695
+
696
+ # ===== 8. VĂN HÓA DÂN GIAN (FOLK CULTURE) =====
697
+ ("water puppet show", "múa rối nước"),
698
+ ("ca tru performance", "Ca trù performance"),
699
+ ("cheo opera", "Chèo opera"),
700
+ ("cai luong opera", "Cải lương"),
701
+ ("tuong classical opera", "Tuồng classical opera"),
702
+ ("vietnamese folklore", "văn hóa dân gian"),
703
+ ("dragon dance", "múa rồng"),
704
+ ("lion dance", "múa lân"),
705
+ ("traditional storytelling", "kể chuyện"),
706
+ ("vietnamese folk songs", "hát dân ca"),
707
+ ("quan ho singing", "quan họ singing"),
708
+ ("hat van ritual", "hát văn ritual"),
709
+ ("xam singing", "xẩm singing"),
710
+ ("folk tales vietnam", "folk tales Vietnam"),
711
+ ("thang long water puppets", "rối nước Thăng Long"),
712
+ ("traditional dance", "múa truyền thống"),
713
+ ("sap dance", "múa sạp"),
714
+ ("xoang dance", "múa xoang"),
715
+ ("shadow dance", "múa bóng rỗi"),
716
+ ("silk dance", "múa lụa"),
717
+ ("lullaby", "hát ru"),
718
+ ("bac ninh quan ho", "hát quan họ Bắc Ninh"),
719
+ ("chau van singing", "hát chầu văn"),
720
+ ("vi giam folk song", "ví giặm"),
721
+ ("ho khoan work song", "hò khoan"),
722
+ ("soong co singing", "hát soong cọ"),
723
+ ("quan ho folk song", "dân ca quan họ"),
724
+ ("xoan singing", "hát xoan"),
725
+ ("hue royal music", "ca Huế"),
726
+ ("nghe tinh folk song", "hò Nghệ Tĩnh"),
727
+
728
+ # ===== 9. GIAO THÔNG (TRANSPORTATION) =====
729
+ ("vietnamese motorbike", "xe máy"),
730
+ ("cyclo vietnam", "xích lô"),
731
+ ("motorbike taxi", "xe ôm"),
732
+ ("mekong boat", "thuyền Mekong"),
733
+ ("vietnamese train", "tàu hỏa"),
734
+ ("vietnamese transportation", "giao thông Việt Nam"),
735
+ ("traditional boat vietnam", "thuyền truyền thống"),
736
+ ("basket boat", "thúng chai"),
737
+ ("dragon boat vietnam", "thuyền rồng"),
738
+ ("vietnamese bus", "xe buýt"),
739
+ ("vietnamese taxi", "taxi"),
740
+ ("grab bike", "grab bike"),
741
+ ("electric vehicle vietnam", "xe điện"),
742
+ ("round boat", "thuyền thúng"),
743
+ ("cargo boat", "ghe bầu"),
744
+ ("kayak vietnam", "thuyền kayak"),
745
+ ("ox cart", "xe bò"),
746
+ ("buffalo cart", "xe trâu"),
747
+ ("palanquin vietnam", "kiệu"),
748
+ ("wedding palanquin", "kiệu hoa"),
749
+ ("three wheeler", "xe lam"),
750
+ ("ferry boat", "đò nang"),
751
+
752
+ # ===== 10. ĐỜI SỐNG HÀNG NGÀY (DAILY LIFE) =====
753
+ ("vietnamese market", "chợ Việt Nam"),
754
+ ("street food vietnam", "street food Vietnam"),
755
+ ("coffee shop vietnam", "coffee shop Vietnam"),
756
+ ("vietnamese family", "gia đình Việt Nam"),
757
+ ("vietnam daily life", "đời sống hàng ngày"),
758
+ ("rice farming vietnam", "rice farming Vietnam"),
759
+ ("fishing village vietnam", "fishing village Vietnam"),
760
+ ("vietnamese school", "trường học Việt Nam"),
761
+ ("traditional market", "chợ truyền thống"),
762
+ ("vietnamese wedding", "đám cưới Việt Nam"),
763
+ ("tet celebration family", "Tết gia đình"),
764
+ ("vietnamese kitchen", "nhà bếp Việt Nam"),
765
+ ("can tho floating market", "chợ nổi Cần Thơ"),
766
+ ("ben thanh market", "chợ Bến Thành"),
767
+ ("dong xuan market", "chợ Đồng Xuân"),
768
+ ("countryside market", "chợ quê"),
769
+ ("craft village vietnam", "làng nghề"),
770
+ ("pottery village", "làng gốm"),
771
+ ("weaving village", "làng dệt"),
772
+ ("fishing village", "làng chài"),
773
+ ("vietnamese farmer", "nông dân"),
774
+ ("rice harvest", "thu hoạch lúa"),
775
+ ("rice planting", "cấy lúa"),
776
+ ("rice threshing", "đập lúa"),
777
+ ("shrimp farming", "nuôi tôm"),
778
+ ("fish farming", "nuôi cá"),
779
+ ("buffalo herding", "chăn trâu bò"),
780
+ ("duck herding", "chăn vịt"),
781
+ ("family meal", "bữa cơm gia đình"),
782
+ ("ancestor altar", "bàn thờ gia tiên"),
783
+ ("vietnamese student", "học sinh Việt Nam"),
784
+ ("classroom", "lớp học"),
785
+ ("playground", "sân chơi"),
786
+ ("vietnamese neighborhood", "khu phố"),
787
+ ("sidewalk cafe", "quán cà phê vỉa hè"),
788
+ ("street food stall", "quán ăn đường phố"),
789
+ ("street vendor", "xe hàng rong"),
790
+ ("daily work", "công việc hàng ngày"),
791
+ ("rural life", "sinh hoạt làng quê"),
792
+ ("city life", "đời sống thành phố"),
793
+
794
+ # ===== 11. TRÒ CHƠI DÂN GIAN (TRADITIONAL GAMES) =====
795
+ ("tug of war vietnam", "kéo co"),
796
+ ("shuttlecock kicking", "đá cầu"),
797
+ ("bamboo dancing", "nhảy sạp"),
798
+ ("kite flying", "thả diều"),
799
+ ("o an quan game", "ô ăn quan"),
800
+ ("blind mans bluff", "bịt mắt bắt dê"),
801
+ ("stick hitting game", "đánh khăng"),
802
+ ("pot breaking game", "đập niêu"),
803
+ ("buffalo fighting", "chọi trâu"),
804
+ ("swing game", "đu tiên"),
805
+ ("vietnamese traditional games", "trò chơi dân gian"),
806
+ ("village wrestling", "hội vật làng"),
807
+ ("traditional jump rope", "nhảy dây truyền thống"),
808
+ ("bamboo spinning top", "đánh quay tre"),
809
+ ("bamboo ring throwing", "thả vòng tre"),
810
+ ("con throwing", "tung còn"),
811
+ ("traditional wrestling", "vật truyền thống"),
812
+ ("cockfighting vietnam", "chọi gà"),
813
+ ("spinning top vietnam", "đánh quay"),
814
+ ("hide and seek vietnam", "trốn tìm"),
815
+ ("stilts walking", "đi cà kheo"),
816
+ ("shuttlecock passing", "chơi chuyền"),
817
+ ("badminton throwing", "ném gà bông"),
818
+ ("marble shooting", "bắn bi"),
819
+ ("hopscotch vietnam", "chơi lò cò"),
820
+ ("tree climbing", "trèo cây"),
821
+ ("river swimming", "bơi sông"),
822
+ ("boat racing", "đua thuyền"),
823
+ ("dragon dancing", "múa rồng"),
824
+ ("children lion dance", "múa lân trẻ em"),
825
+ ("drum playing", "đánh trống"),
826
+ ("flute playing", "thổi kèn"),
827
+ ("instrument playing", "chơi đàn"),
828
+ ("storytelling", "kể chuyện"),
829
+ ("poetry reciting", "đọc thơ"),
830
+
831
+ # ===== 12. THỂ THAO TRUYỀN THỐNG (TRADITIONAL SPORTS) =====
832
+ ("dragon boat racing vietnam", "đua thuyền rồng"),
833
+ ("vietnamese traditional wrestling", "vật cổ truyền"),
834
+ ("stick pushing", "đẩy gậy"),
835
+ ("crossbow shooting", "bắn nỏ"),
836
+ ("sepak takraw vietnam", "cầu mây"),
837
+ ("vietnamese martial arts", "võ cổ truyền"),
838
+ ("lion dragon competition", "lân sư rồng thi đấu"),
839
+ ("vietnamese chess", "cờ tướng"),
840
+ ("traditional stick fighting", "đánh gậy truyền thống"),
841
+ ("ghe ngo boat racing", "đua ghe ngo"),
842
+ ("bay nui ox racing", "đua bò Bảy Núi"),
843
+ ("ha long kayak racing", "đua thuyền kayak Hạ Long"),
844
+ ("vovinam demonstration", "vovinam biểu diễn"),
845
+ ("vietnamese boxing", "muay Việt Nam"),
846
+ ("binh dinh martial arts", "võ Bình Định"),
847
+ ("tay son martial arts", "võ Tây Sơn"),
848
+ ("traditional weapons", "kim khí"),
849
+ ("nunchaku", "côn nhị khúc"),
850
+ ("tai chi vietnam", "thái cực quyền"),
851
+ ("boxing vietnam", "quyền anh"),
852
+ ("judo vietnam", "judo"),
853
+ ("wrestling vietnam", "đấu vật"),
854
+ ("weightlifting vietnam", "cử tạ"),
855
+ ("swimming vietnam", "bơi lội"),
856
+ ("cycling racing", "đua xe đạp"),
857
+ ("marathon vietnam", "marathon"),
858
+ ("badminton vietnam", "cầu lông"),
859
+ ("tennis vietnam", "tennis"),
860
+ ("table tennis vietnam", "bóng bàn"),
861
+ ("karate vietnam", "karatedo"),
862
+ ("taekwondo vietnam", "taekwondo"),
863
+ ("football vietnam", "bóng đá"),
864
+ ("beach volleyball vietnam", "bóng chuyền bãi biển"),
865
+ ("street basketball vietnam", "bóng rổ đường phố"),
866
+ ("athletics sea games vietnam", "điền kinh SEA Games"),
867
+ ("kickboxing vietnam", "kickboxing"),
868
+ ("mma vietnam", "MMA"),
869
+ ("gymnastics vietnam", "gymnastics"),
870
+ ("diving vietnam", "diving"),
871
+
872
+ # ===== GENERAL CULTURAL TERMS =====
873
+ ("vietnamese culture", "văn hóa Việt Nam"),
874
+ ("traditional festival", "lễ hội truyền thống"),
875
+ ("vietnamese tradition", "truyền thống Việt Nam"),
876
+ ("vietnamese heritage", "di sản Việt Nam"),
877
+ ("folk culture", "văn hóa dân gian"),
878
+ ("traditional art", "nghệ thuật truyền thống"),
879
+ ("vietnamese customs", "phong tục Việt Nam"),
880
+ ("cultural performance", "biểu diễn văn hóa"),
881
+ ("ethnic minority", "dân tộc thiểu số"),
882
+ ("cultural identity", "bản sắc văn hóa"),
883
+ ]
884
+
885
+ # Extract English terms for CLIP detection
886
+ english_terms = [pair[0] for pair in vocabulary_pairs]
887
+ vietnamese_terms = [pair[1] for pair in vocabulary_pairs]
888
+
889
+ # Store mapping for result translation
890
+ self.en_to_vi_mapping = dict(vocabulary_pairs)
891
+
892
+ logger.info(f"Loaded comprehensive cultural vocabulary: {len(english_terms)} items across 12 categories")
893
+ return english_terms
894
+
895
def detect_objects(self, image: Image.Image, threshold: float = 0.15) -> List[str]:
    """Detect cultural objects in an image by zero-shot CLIP prompt matching.

    Each distinct term in ``self.cultural_vocabulary`` is expanded into
    several prompt templates. All prompts are scored against ``image`` in
    memory-bounded batches, and the raw logits are normalised with a single
    softmax over the *whole* prompt set. A term's score blends the mean and
    the maximum probability of its prompts (30% / 70%).

    Args:
        image: Image passed to ``self.clip_processor`` together with the
            prompts.
        threshold: A term is reported only when its blended score is
            strictly greater than this value.

    Returns:
        Names of the detected objects, translated through
        ``self.en_to_vi_mapping`` (the term itself is used when it has no
        mapping). Each name appears at most once, in vocabulary order. An
        empty list is returned when nothing passes the threshold or when
        detection fails; a failure is logged as a warning, not raised.
    """
    templates = (
        "a photo of {}",
        "an image showing {}",
        "{}",
        "traditional {}",
        "vietnamese {}",
    )
    # Prompts per forward pass; batching exists only to bound memory use.
    batch_size = 50

    try:
        # Drop repeated vocabulary entries (order preserved): identical
        # prompts would otherwise split probability mass between themselves
        # and unfairly lower that term's score.
        terms = list(dict.fromkeys(self.cultural_vocabulary))
        if not terms:
            return []

        # Prompts of one term are contiguous, so the flat score vector can
        # later be viewed as a (term, template) matrix.
        prompts = [
            template.format(term) for term in terms for template in templates
        ]

        logit_chunks = []
        for start in range(0, len(prompts), batch_size):
            inputs = self.clip_processor(
                text=prompts[start:start + batch_size],
                images=image,
                return_tensors="pt",
                padding=True,
            ).to(self.device)

            with torch.no_grad():
                outputs = self.clip_model(**inputs)

            # Keep raw logits. A softmax taken here would normalise within
            # this batch only, making scores depend on batch composition.
            logit_chunks.append(outputs.logits_per_image[0].float().cpu())

        # One softmax over every prompt keeps scores comparable across terms.
        probs = (
            torch.cat(logit_chunks)
            .softmax(dim=0)
            .numpy()
            .reshape(len(terms), len(templates))
        )

        # Weight the best-matching template more than the template average.
        scores = probs.mean(axis=1) * 0.3 + probs.max(axis=1) * 0.7

        detected_objects = []
        seen = set()
        for term, score in zip(terms, scores):
            if score <= threshold:
                continue

            # Translate back to Vietnamese.
            vietnamese_name = self.en_to_vi_mapping.get(term, term)
            logger.debug(
                "Detected %s -> %s (score: %.3f)", term, vietnamese_name, score
            )

            # Several English terms share one Vietnamese name; report it once.
            if vietnamese_name not in seen:
                seen.add(vietnamese_name)
                detected_objects.append(vietnamese_name)

        return detected_objects

    except Exception as e:
        # Deliberate best-effort: a detection failure must not break the
        # caller, so log it and report "nothing detected".
        logger.warning("Object detection failed: %s", e)
        return []