Upload folder using huggingface_hub
- core/evaluation.py +628 -0
- core/post_hoc_explainer.py +418 -0
- core/viet_meagent.py +964 -0
core/evaluation.py
ADDED
@@ -0,0 +1,628 @@
import json
import numpy as np
from typing import Dict, List
import logging
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import re

logger = logging.getLogger(__name__)

class VietMEAgentEvaluator:
    """Comprehensive evaluation for VietMEAgent - FIXED VERSION"""

    def __init__(self, cultural_kb_path: str):
        # Load cultural knowledge for evaluation
        with open(cultural_kb_path, 'r', encoding='utf-8') as f:
            self.cultural_kb = json.load(f)

        # Initialize evaluation tools
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
        self.sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
        self.smoothing = SmoothingFunction().method1

        # Cultural object vocabulary - EXPANDED
        self.cultural_vocabulary = set()
        for obj_name, obj_data in self.cultural_kb['objects'].items():
            self.cultural_vocabulary.add(obj_name.lower())
            # Add variations
            if 'name' in obj_data:
                self.cultural_vocabulary.add(obj_data['name'].lower())

        # Additional common Vietnamese cultural terms
        additional_terms = [
            'phở', 'bánh mì', 'áo dài', 'nón lá', 'chùa', 'đình', 'làng', 'thờ',
            'tết', 'trung thu', 'gỏi cuốn', 'bánh xèo', 'cà phê', 'trúc', 'tre',
            'đàn bầu', 'trống', 'sáo', 'múa lân', 'rối nước', 'việt nam'
        ]
        self.cultural_vocabulary.update(additional_terms)

        logger.info(f"Initialized evaluator with {len(self.cultural_vocabulary)} cultural terms")

    def evaluate_batch(self, predictions: List[Dict], ground_truth: List[Dict]) -> Dict:
        """Evaluate a batch of predictions"""

        logger.info(f"Evaluating {len(predictions)} predictions against {len(ground_truth)} ground truth")

        results = {
            'language_quality': {},
            'cultural_relevance': {},
            'visual_grounding': {},
            'overall_performance': {}
        }

        # Language quality metrics
        results['language_quality'] = self.evaluate_language_quality(predictions, ground_truth)

        # Cultural relevance metrics
        results['cultural_relevance'] = self.evaluate_cultural_relevance(predictions, ground_truth)

        # Visual grounding metrics
        results['visual_grounding'] = self.evaluate_visual_grounding(predictions, ground_truth)

        # Overall performance
        results['overall_performance'] = self.calculate_overall_performance(results)

        # Debug metrics
        self.debug_evaluation_results(results, predictions, ground_truth)

        return results

    def debug_evaluation_results(self, results: Dict, predictions: List[Dict], ground_truth: List[Dict]):
        """Debug evaluation results"""
        logger.info("=== EVALUATION DEBUG ===")

        # Sample text comparison
        if predictions and ground_truth:
            pred_text = self.extract_text_from_prediction(predictions[0])
            gt_text = self.extract_text_from_ground_truth(ground_truth[0])
            logger.info(f"Sample prediction text: {pred_text[:100]}...")
            logger.info(f"Sample ground truth text: {gt_text[:100]}...")

            # Cultural objects
            pred_cultural = self.extract_cultural_objects(predictions[0])
            gt_cultural = self.extract_cultural_objects(ground_truth[0])
            logger.info(f"Pred cultural objects: {pred_cultural}")
            logger.info(f"GT cultural objects: {gt_cultural}")

        logger.info("=== END DEBUG ===")

    def evaluate_language_quality(self, predictions: List[Dict], ground_truth: List[Dict]) -> Dict:
        """Evaluate language quality using BLEU and ROUGE - IMPROVED"""

        bleu_scores = []
        rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

        valid_comparisons = 0

        for pred, gt in zip(predictions, ground_truth):
            # Extract text for comparison - IMPROVED
            pred_text = self.extract_text_from_prediction(pred)
            gt_text = self.extract_text_from_ground_truth(gt)

            if pred_text and gt_text:
                # Clean and normalize text
                pred_clean = self.clean_vietnamese_text(pred_text)
                gt_clean = self.clean_vietnamese_text(gt_text)

                if pred_clean and gt_clean:
                    valid_comparisons += 1

                    # BLEU score - IMPROVED tokenization
                    pred_tokens = self.tokenize_vietnamese(pred_clean)
                    gt_tokens = self.tokenize_vietnamese(gt_clean)

                    if pred_tokens and gt_tokens:
                        # Use multiple references for better BLEU
                        references = [gt_tokens]
                        # Add variations
                        if len(gt_tokens) > 3:
                            references.append(gt_tokens[:-1])  # Remove last word
                            references.append(gt_tokens[1:])   # Remove first word

                        bleu = sentence_bleu(
                            references,
                            pred_tokens,
                            smoothing_function=self.smoothing,
                            weights=(0.5, 0.3, 0.2)  # Give more weight to unigrams and bigrams
                        )
                        bleu_scores.append(bleu)

                    # ROUGE scores
                    try:
                        rouge_result = self.rouge_scorer.score(pred_clean, gt_clean)
                        for metric in rouge_scores:
                            rouge_scores[metric].append(rouge_result[metric].fmeasure)
                    except Exception as e:
                        logger.warning(f"ROUGE calculation failed: {e}")

        logger.info(f"Language quality: {valid_comparisons} valid comparisons out of {len(predictions)}")

        return {
            'bleu': np.mean(bleu_scores) if bleu_scores else 0.0,
            'rouge1': np.mean(rouge_scores['rouge1']) if rouge_scores['rouge1'] else 0.0,
            'rouge2': np.mean(rouge_scores['rouge2']) if rouge_scores['rouge2'] else 0.0,
            'rougeL': np.mean(rouge_scores['rougeL']) if rouge_scores['rougeL'] else 0.0,
            'num_evaluated': valid_comparisons
        }

    def clean_vietnamese_text(self, text: str) -> str:
        """Clean and normalize Vietnamese text"""
        if not text:
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Remove special characters but keep Vietnamese diacritics
        text = re.sub(r'[^\w\sàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', '', text)

        return text

    def tokenize_vietnamese(self, text: str) -> List[str]:
        """Tokenize Vietnamese text"""
        if not text:
            return []

        # Simple word-based tokenization
        tokens = text.split()

        # Filter out very short tokens
        tokens = [t for t in tokens if len(t) > 1]

        return tokens

    def evaluate_cultural_relevance(self, predictions: List[Dict], ground_truth: List[Dict]) -> Dict:
        """Evaluate cultural relevance of predictions - IMPROVED"""

        cultural_precision = []
        cultural_recall = []
        cultural_accuracy = []
        cultural_mentions = []

        for pred, gt in zip(predictions, ground_truth):
            # Extract cultural objects - IMPROVED
            pred_cultural = self.extract_cultural_objects(pred)
            gt_cultural = self.extract_cultural_objects(gt)

            # Count cultural mentions in text
            pred_text = self.extract_text_from_prediction(pred)
            gt_text = self.extract_text_from_ground_truth(gt)

            pred_mentions = self.count_cultural_mentions(pred_text)
            gt_mentions = self.count_cultural_mentions(gt_text)

            cultural_mentions.append({
                'pred_mentions': pred_mentions,
                'gt_mentions': gt_mentions,
                'mention_overlap': len(set(pred_mentions).intersection(set(gt_mentions)))
            })

            # If we have ground truth cultural objects
            if gt_cultural or gt_mentions:
                all_gt_cultural = gt_cultural.union(set(gt_mentions))
                all_pred_cultural = pred_cultural.union(set(pred_mentions))

                if all_pred_cultural:
                    precision = len(all_pred_cultural.intersection(all_gt_cultural)) / len(all_pred_cultural)
                    cultural_precision.append(precision)

                if all_gt_cultural:
                    recall = len(all_pred_cultural.intersection(all_gt_cultural)) / len(all_gt_cultural)
                    cultural_recall.append(recall)

            # Cultural context accuracy using semantic similarity
            if pred_text and gt_text:
                cultural_acc = self.evaluate_cultural_context_accuracy(pred, gt)
                cultural_accuracy.append(cultural_acc)

        # Calculate cultural mention accuracy
        mention_accuracy = 0.0
        if cultural_mentions:
            total_overlap = sum(m['mention_overlap'] for m in cultural_mentions)
            total_gt_mentions = sum(len(m['gt_mentions']) for m in cultural_mentions)
            mention_accuracy = total_overlap / total_gt_mentions if total_gt_mentions > 0 else 0.0

        return {
            'cultural_precision': np.mean(cultural_precision) if cultural_precision else 0.0,
            'cultural_recall': np.mean(cultural_recall) if cultural_recall else 0.0,
            'cultural_accuracy': np.mean(cultural_accuracy) if cultural_accuracy else 0.0,
            'cultural_mention_accuracy': mention_accuracy,
            'cultural_f1': self.calculate_f1(
                np.mean(cultural_precision) if cultural_precision else 0.0,
                np.mean(cultural_recall) if cultural_recall else 0.0
            ),
            'num_cultural_samples': len(cultural_mentions)
        }

    def count_cultural_mentions(self, text: str) -> List[str]:
        """Count mentions of cultural terms in text"""
        if not text:
            return []

        text_lower = text.lower()
        mentions = []

        for cultural_term in self.cultural_vocabulary:
            if cultural_term in text_lower:
                mentions.append(cultural_term)

        return mentions

    def evaluate_visual_grounding(self, predictions: List[Dict], ground_truth: List[Dict]) -> Dict:
        """Evaluate visual grounding accuracy - IMPROVED"""

        grounding_scores = []
        detection_accuracy = []
        heatmap_quality = []

        for pred, gt in zip(predictions, ground_truth):
            # Heatmap-based grounding evaluation
            if 'heatmap' in pred:
                heatmap = np.array(pred['heatmap']) if isinstance(pred['heatmap'], list) else pred['heatmap']

                # Basic heatmap quality metrics
                if heatmap.size > 0:
                    concentration = np.std(heatmap)
                    coverage = np.mean(heatmap > 0.3)
                    max_attention = np.max(heatmap)

                    # Simple quality score
                    quality_score = min(1.0, (concentration * 2 + coverage + max_attention) / 3)
                    heatmap_quality.append(quality_score)

                    # If we have ground truth regions, calculate IoU
                    if 'attention_regions' in gt:
                        iou = self.calculate_grounding_accuracy(heatmap, gt['attention_regions'])
                        grounding_scores.append(iou)
                    else:
                        # Use heatmap quality as proxy for grounding
                        grounding_scores.append(quality_score * 0.5)  # Lower weight without GT

            # Object detection accuracy
            pred_objects = []
            if 'image_analysis' in pred and 'cultural_objects' in pred['image_analysis']:
                pred_objects = pred['image_analysis']['cultural_objects']
            elif 'cultural_objects' in pred:
                pred_objects = pred['cultural_objects']

            gt_objects = []
            if 'image_analysis' in gt and 'cultural_objects' in gt['image_analysis']:
                gt_objects = gt['image_analysis']['cultural_objects']
            elif 'cultural_objects' in gt:
                gt_objects = gt['cultural_objects']

            if gt_objects or pred_objects:
                detection_acc = self.calculate_detection_accuracy(pred_objects, gt_objects)
                detection_accuracy.append(detection_acc)

        return {
            'visual_grounding': np.mean(grounding_scores) if grounding_scores else 0.0,
            'detection_accuracy': np.mean(detection_accuracy) if detection_accuracy else 0.0,
            'heatmap_quality': np.mean(heatmap_quality) if heatmap_quality else 0.0,
            'num_grounding_samples': len(grounding_scores),
            'num_detection_samples': len(detection_accuracy)
        }

    def extract_text_from_prediction(self, prediction: Dict) -> str:
        """Extract text from prediction for evaluation - IMPROVED"""
        texts = []

        # Extract from questions
        if 'questions' in prediction:
            for q in prediction['questions']:
                if 'explanation' in q and q['explanation']:
                    texts.append(str(q['explanation']))
                if 'answer' in q and q['answer']:
                    texts.append(str(q['answer']))
                if 'question' in q and q['question']:
                    texts.append(str(q['question']))

        # Extract from vietnamese_explanation
        if 'vietnamese_explanation' in prediction and prediction['vietnamese_explanation']:
            texts.append(str(prediction['vietnamese_explanation']))

        # Extract from image analysis
        if 'image_analysis' in prediction:
            analysis = prediction['image_analysis']
            if 'vietnamese_text' in analysis:
                texts.extend([str(t) for t in analysis['vietnamese_text'] if t])

        return ' '.join(texts)

    def extract_text_from_ground_truth(self, ground_truth: Dict) -> str:
        """Extract text from ground truth for evaluation - IMPROVED"""
        texts = []

        # Extract from questions
        if 'questions' in ground_truth:
            for q in ground_truth['questions']:
                if 'explanation' in q and q['explanation']:
                    texts.append(str(q['explanation']))
                if 'answer' in q and q['answer']:
                    texts.append(str(q['answer']))
                if 'question' in q and q['question']:
                    texts.append(str(q['question']))

        # Extract from image analysis
        if 'image_analysis' in ground_truth:
            analysis = ground_truth['image_analysis']
            if 'vietnamese_text' in analysis:
                texts.extend([str(t) for t in analysis['vietnamese_text'] if t])

        return ' '.join(texts)

    def extract_cultural_objects(self, data: Dict) -> set:
        """Extract cultural objects mentioned in data - IMPROVED"""
        cultural_objects = set()

        # Get all text from the data
        text = ""
        if 'questions' in data:
            text = self.extract_text_from_prediction(data)
        else:
            text = self.extract_text_from_ground_truth(data)

        text_lower = text.lower()

        # Find cultural terms in text
        for cultural_term in self.cultural_vocabulary:
            if cultural_term in text_lower:
                cultural_objects.add(cultural_term)

        # Also check explicit cultural_objects fields
        if 'cultural_objects' in data:
            for obj in data['cultural_objects']:
                cultural_objects.add(str(obj).lower())

        if 'image_analysis' in data and 'cultural_objects' in data['image_analysis']:
            for obj in data['image_analysis']['cultural_objects']:
                cultural_objects.add(str(obj).lower())

        return cultural_objects

    def evaluate_cultural_context_accuracy(self, prediction: Dict, ground_truth: Dict) -> float:
        """Evaluate accuracy of cultural context understanding - IMPROVED"""

        # Extract cultural explanations
        pred_text = self.extract_text_from_prediction(prediction)
        gt_text = self.extract_text_from_ground_truth(ground_truth)

        if not pred_text or not gt_text:
            return 0.0

        # Clean texts
        pred_clean = self.clean_vietnamese_text(pred_text)
        gt_clean = self.clean_vietnamese_text(gt_text)

        if not pred_clean or not gt_clean:
            return 0.0

        try:
            # Use semantic similarity for cultural context evaluation
            pred_embedding = self.sentence_model.encode([pred_clean])
            gt_embedding = self.sentence_model.encode([gt_clean])

            # Calculate cosine similarity
            similarity = np.dot(pred_embedding[0], gt_embedding[0]) / (
                np.linalg.norm(pred_embedding[0]) * np.linalg.norm(gt_embedding[0])
            )

            return max(0.0, float(similarity))  # Ensure non-negative

        except Exception as e:
            logger.warning(f"Cultural context accuracy calculation failed: {e}")
            return 0.0

    def calculate_grounding_accuracy(self, pred_heatmap: np.ndarray, gt_regions: List) -> float:
        """Calculate visual grounding accuracy"""
        if len(gt_regions) == 0 or pred_heatmap.size == 0:
            return 0.0

        try:
            # Ensure heatmap is 2D
            if pred_heatmap.ndim > 2:
                pred_heatmap = pred_heatmap.reshape(-1, pred_heatmap.shape[-1])

            # Create ground truth mask
            gt_mask = np.zeros_like(pred_heatmap)
            for region in gt_regions:
                if isinstance(region, (list, tuple)) and len(region) >= 4:
                    x, y, w, h = region[:4]
                    x, y, w, h = int(x), int(y), int(w), int(h)

                    # Ensure bounds
                    x = max(0, min(x, gt_mask.shape[1] - 1))
                    y = max(0, min(y, gt_mask.shape[0] - 1))
                    w = max(1, min(w, gt_mask.shape[1] - x))
                    h = max(1, min(h, gt_mask.shape[0] - y))

                    gt_mask[y:y+h, x:x+w] = 1

            # Threshold prediction heatmap
            pred_mask = (pred_heatmap > 0.5).astype(np.float32)

            # Calculate IoU
            intersection = np.logical_and(pred_mask, gt_mask).sum()
            union = np.logical_or(pred_mask, gt_mask).sum()

            return float(intersection / union) if union > 0 else 0.0

        except Exception as e:
            logger.warning(f"Grounding accuracy calculation failed: {e}")
            return 0.0

    def calculate_detection_accuracy(self, pred_objects: List, gt_objects: List) -> float:
        """Calculate object detection accuracy - IMPROVED"""
        if not gt_objects and not pred_objects:
            return 1.0

        if not gt_objects:
            return 0.0 if pred_objects else 1.0

        # Convert to lowercase and clean
        pred_set = set(str(obj).lower().strip() for obj in pred_objects if obj)
        gt_set = set(str(obj).lower().strip() for obj in gt_objects if obj)

        if not gt_set:
            return 1.0 if not pred_set else 0.0

        # Calculate Jaccard similarity (IoU for sets)
        intersection = len(pred_set.intersection(gt_set))
        union = len(pred_set.union(gt_set))

        return intersection / union if union > 0 else 0.0

    def calculate_f1(self, precision: float, recall: float) -> float:
        """Calculate F1 score"""
        if precision + recall == 0:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)

    def calculate_overall_performance(self, results: Dict) -> Dict:
        """Calculate overall performance metrics - IMPROVED"""

        # Weight different aspects
        weights = {
            'language_quality': 0.4,   # Increased weight
            'cultural_relevance': 0.4, # Increased weight
            'visual_grounding': 0.2    # Decreased weight (often no GT data)
        }

        # Calculate weighted average using multiple metrics
        overall_score = 0.0
        component_scores = {}

        for aspect, weight in weights.items():
            if aspect in results:
                if aspect == 'language_quality':
                    # Average of ROUGE-L and BLEU (ROUGE usually more reliable for Vietnamese)
                    rouge_l = results[aspect].get('rougeL', 0.0)
                    bleu = results[aspect].get('bleu', 0.0)
                    score = (rouge_l * 0.7 + bleu * 0.3)  # Weight ROUGE-L higher
                elif aspect == 'cultural_relevance':
                    # Average of multiple cultural metrics
                    cult_acc = results[aspect].get('cultural_accuracy', 0.0)
                    cult_f1 = results[aspect].get('cultural_f1', 0.0)
                    mention_acc = results[aspect].get('cultural_mention_accuracy', 0.0)
                    score = (cult_acc * 0.4 + cult_f1 * 0.3 + mention_acc * 0.3)
                elif aspect == 'visual_grounding':
                    # Average of grounding metrics
                    grounding = results[aspect].get('visual_grounding', 0.0)
                    detection = results[aspect].get('detection_accuracy', 0.0)
                    heatmap_q = results[aspect].get('heatmap_quality', 0.0)
                    score = (grounding * 0.4 + detection * 0.4 + heatmap_q * 0.2)

                component_scores[aspect] = score
                overall_score += weight * score

        return {
            'overall_score': overall_score,
            'component_scores': component_scores,
            'weights': weights
        }

    def generate_evaluation_report(self, results: Dict, save_path: str = None) -> str:
        """Generate comprehensive evaluation report - IMPROVED"""

        report = f"""
VietMEAgent Evaluation Report
{'='*50}

Language Quality:
BLEU Score: {results['language_quality']['bleu']:.4f}
ROUGE-1: {results['language_quality']['rouge1']:.4f}
ROUGE-2: {results['language_quality']['rouge2']:.4f}
ROUGE-L: {results['language_quality']['rougeL']:.4f}
Samples Evaluated: {results['language_quality']['num_evaluated']}

Cultural Relevance:
Cultural Precision: {results['cultural_relevance']['cultural_precision']:.4f}
Cultural Recall: {results['cultural_relevance']['cultural_recall']:.4f}
Cultural F1: {results['cultural_relevance']['cultural_f1']:.4f}
Cultural Accuracy: {results['cultural_relevance']['cultural_accuracy']:.4f}
Cultural Mention Accuracy: {results['cultural_relevance']['cultural_mention_accuracy']:.4f}
Cultural Samples: {results['cultural_relevance']['num_cultural_samples']}

Visual Grounding:
Grounding Accuracy: {results['visual_grounding']['visual_grounding']:.4f}
Detection Accuracy: {results['visual_grounding']['detection_accuracy']:.4f}
Heatmap Quality: {results['visual_grounding']['heatmap_quality']:.4f}
Grounding Samples: {results['visual_grounding']['num_grounding_samples']}
Detection Samples: {results['visual_grounding']['num_detection_samples']}

Overall Performance:
Overall Score: {results['overall_performance']['overall_score']:.4f}
Component Scores: {results['overall_performance']['component_scores']}

{'='*50}
"""

        if save_path:
            with open(save_path, 'w', encoding='utf-8') as f:
                f.write(report)
            logger.info(f"Evaluation report saved to {save_path}")

        return report

    def plot_evaluation_results(self, results: Dict, save_path: str = None):
        """Plot evaluation results - IMPROVED"""

        # Create subplots
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Language Quality
        lang_metrics = ['bleu', 'rouge1', 'rouge2', 'rougeL']
        lang_scores = [results['language_quality'][m] for m in lang_metrics]

        axes[0, 0].bar(lang_metrics, lang_scores, color='skyblue')
        axes[0, 0].set_title('Language Quality Metrics')
        axes[0, 0].set_ylim(0, 1)
        axes[0, 0].tick_params(axis='x', rotation=45)

        # Cultural Relevance
        cult_metrics = ['cultural_precision', 'cultural_recall', 'cultural_f1', 'cultural_accuracy']
        cult_scores = [results['cultural_relevance'][m] for m in cult_metrics]

        axes[0, 1].bar(cult_metrics, cult_scores, color='lightcoral')
        axes[0, 1].set_title('Cultural Relevance Metrics')
        axes[0, 1].set_ylim(0, 1)
        axes[0, 1].tick_params(axis='x', rotation=45)

        # Visual Grounding
        visual_metrics = ['visual_grounding', 'detection_accuracy', 'heatmap_quality']
        visual_scores = [results['visual_grounding'][m] for m in visual_metrics]

        axes[1, 0].bar(visual_metrics, visual_scores, color='lightgreen')
        axes[1, 0].set_title('Visual Grounding Metrics')
        axes[1, 0].set_ylim(0, 1)
        axes[1, 0].tick_params(axis='x', rotation=45)

        # Overall comparison
        overall_metrics = ['Language Quality', 'Cultural Relevance', 'Visual Grounding']
        component_scores = results['overall_performance']['component_scores']
        overall_scores = [
            component_scores.get('language_quality', 0),
            component_scores.get('cultural_relevance', 0),
            component_scores.get('visual_grounding', 0)
        ]

        axes[1, 1].bar(overall_metrics, overall_scores, color='gold')
        axes[1, 1].set_title('Overall Performance Comparison')
        axes[1, 1].set_ylim(0, 1)
        axes[1, 1].tick_params(axis='x', rotation=45)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            logger.info(f"Evaluation plots saved to {save_path}")

        plt.show()
        return fig
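As a usage note: a minimal sketch of driving the evaluator above, assuming a cultural KB JSON with an 'objects' mapping; the KB path and the sample records here are hypothetical, not fixed by this commit.

from core.evaluation import VietMEAgentEvaluator

# Hypothetical KB path; the file must contain an 'objects' dict.
evaluator = VietMEAgentEvaluator("data/cultural_kb.json")

# Predictions and ground truth follow the dict shapes the evaluator reads:
# 'questions' entries with 'question'/'answer'/'explanation' strings, and an
# optional 'heatmap' (2D list or array) on the prediction side.
predictions = [{
    "questions": [{"question": "Đây là món gì?", "answer": "phở",
                   "explanation": "Món phở là món ăn truyền thống của Việt Nam."}],
    "heatmap": [[0.1] * 4] * 4,
}]
ground_truth = [{
    "questions": [{"question": "Đây là món gì?", "answer": "phở",
                   "explanation": "Phở bò truyền thống."}],
}]

results = evaluator.evaluate_batch(predictions, ground_truth)
print(evaluator.generate_evaluation_report(results))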
core/post_hoc_explainer.py
ADDED
@@ -0,0 +1,418 @@
import torch
import torch.nn.functional as F
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from transformers import CLIPProcessor, CLIPModel
import logging

logger = logging.getLogger(__name__)

class PostHocExplainer:
    """
    Post-hoc explanation module for generating visual explanations
    Implements heatmaps to show which image regions influenced the answer
    """

    def __init__(self, clip_model, clip_processor=None, device='cuda'):
        self.clip_model = clip_model
        self.clip_processor = clip_processor
        self.device = device

        # Validate inputs
        if self.clip_model is None:
            raise ValueError("CLIP model cannot be None")

        if self.clip_processor is None:
            logger.warning("CLIP processor is None, some methods may not work")

        # Set model to evaluation mode
        self.clip_model.eval()

        logger.info("PostHocExplainer initialized with CLIP model")

    def generate_heatmap(self, image, question_text=None, method='attention_rollout'):
        """Generate heatmap showing important image regions for VQA"""
        logger.info(f"Generating heatmap using method: {method}")

        try:
            if method == 'attention_rollout':
                return self.generate_attention_rollout_heatmap(image, question_text)
            elif method == 'gradient_based':
                return self.generate_gradient_heatmap(image, question_text)
            elif method == 'occlusion':
                return self.generate_occlusion_heatmap(image, question_text)
            else:
                logger.warning(f"Unknown method {method}, using attention_rollout")
                return self.generate_attention_rollout_heatmap(image, question_text)

        except Exception as e:
            logger.error(f"Heatmap generation failed: {e}")
            logger.info("Using fallback center-focused heatmap")
            return self.create_center_fallback_heatmap()

    def generate_attention_rollout_heatmap(self, image, question_text=None):
        """Generate heatmap using attention rollout method"""
        logger.info("Generating attention rollout heatmap")

        try:
            # Check if processor is available
            if self.clip_processor is None:
                raise ValueError("CLIP processor is required for attention rollout")

            # Prepare inputs
            if question_text is None:
                question_text = "What is in this image?"

            # Process image and text with truncation
            inputs = self.clip_processor(
                text=[question_text],
                images=image,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=77  # CLIP's maximum token length
            ).to(self.device)

            logger.info("Running forward pass with attention outputs")

            # Get attention weights
            with torch.no_grad():
                outputs = self.clip_model(**inputs, output_attentions=True)

            # Try different ways to access vision attention
            vision_attentions = None

            # Method 1: Direct access
            if hasattr(outputs, 'vision_model_output') and outputs.vision_model_output is not None:
                if hasattr(outputs.vision_model_output, 'attentions'):
                    vision_attentions = outputs.vision_model_output.attentions
                    logger.info("Found vision attentions via vision_model_output")

            # Method 2: Check if attentions are in main output
            if vision_attentions is None and hasattr(outputs, 'attentions'):
                vision_attentions = outputs.attentions
                logger.info("Found attentions in main output")

            # If still no attention, create fallback
            if vision_attentions is None or len(vision_attentions) == 0:
                logger.warning("No attention weights found, creating uniform attention")
                attention_2d = torch.ones(7, 7) / 49
            else:
                # Extract attention from last layer
                last_attention = vision_attentions[-1]  # Last layer

                # Average across heads and batch
                attention_map = last_attention.mean(dim=1)[0]  # [seq_len, seq_len]

                # Get spatial attention (excluding CLS token)
                spatial_attention = attention_map[1:, 1:]  # Remove CLS token

                # Reshape to spatial dimensions
                patch_size = int(np.sqrt(spatial_attention.shape[0]))
                if spatial_attention.shape[0] == patch_size * patch_size:
                    attention_2d = spatial_attention.mean(dim=1).reshape(patch_size, patch_size)
                    logger.info(f"Reshaped attention to {patch_size}x{patch_size}")
                else:
                    logger.warning(f"Cannot reshape attention {spatial_attention.shape}, using uniform")
                    attention_2d = torch.ones(7, 7) / 49

            # Resize to 224x224
            attention_2d = F.interpolate(
                attention_2d.unsqueeze(0).unsqueeze(0),
                size=(224, 224),
                mode='bilinear',
                align_corners=False
            ).squeeze().cpu().numpy()

            # Normalize to [0, 1]
            attention_2d = (attention_2d - attention_2d.min()) / (attention_2d.max() - attention_2d.min() + 1e-8)

            logger.info(f"Generated attention heatmap with shape {attention_2d.shape}")
            return attention_2d

        except Exception as e:
            logger.warning(f"Attention rollout failed: {e}, using gradient method")
            return self.generate_gradient_heatmap(image, question_text)

    def generate_gradient_heatmap(self, image, question_text=None):
        """Generate heatmap using gradient-based method"""
        logger.info("Generating gradient-based heatmap")

        try:
            if self.clip_processor is None:
                raise ValueError("CLIP processor is required for gradient method")

            if question_text is None:
                question_text = "What is in this image?"

            # Enable gradient computation
            self.clip_model.train()

            # Process inputs with truncation
            inputs = self.clip_processor(
                text=[question_text],
                images=image,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=77  # CLIP's maximum token length
            ).to(self.device)

            # Require gradients for pixel values
            inputs['pixel_values'].requires_grad_(True)

            logger.info("Running forward pass for gradients")

            # Forward pass
            outputs = self.clip_model(**inputs)

            # Get image-text similarity score
            logits_per_image = outputs.logits_per_image[0, 0]

            logger.info("Computing gradients")

            # Backward pass
            logits_per_image.backward()

            # Get gradients
            gradients = inputs['pixel_values'].grad[0]  # [C, H, W]

            # Create heatmap from gradients
            heatmap = torch.norm(gradients, dim=0).cpu().numpy()  # [H, W]

            # Normalize
            heatmap = (heatmap - heatmap.min()) / (heatmap.max() - heatmap.min() + 1e-8)

            # Reset model to eval mode
            self.clip_model.eval()

            logger.info(f"Generated gradient heatmap with shape {heatmap.shape}")
            return heatmap

        except Exception as e:
            logger.warning(f"Gradient method failed: {e}, using occlusion method")
            return self.generate_occlusion_heatmap(image, question_text)

    def generate_occlusion_heatmap(self, image, question_text=None, patch_size=32):
        """Generate heatmap using occlusion method"""
        logger.info("Generating occlusion-based heatmap")

        try:
            if self.clip_processor is None:
                raise ValueError("CLIP processor is required for occlusion method")

            if question_text is None:
                question_text = "What is in this image?"

            # Convert to numpy for processing
            if isinstance(image, Image.Image):
                image_np = np.array(image)
            else:
                image_np = image

            # Resize to standard size
            image_resized = cv2.resize(image_np, (224, 224))
            image_pil = Image.fromarray(image_resized)

            logger.info("Getting baseline score")

            # Get baseline score
            inputs_baseline = self.clip_processor(
                text=[question_text],
                images=image_pil,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=77  # CLIP's maximum token length
            ).to(self.device)

            with torch.no_grad():
                baseline_output = self.clip_model(**inputs_baseline)
                baseline_score = baseline_output.logits_per_image[0, 0].cpu().item()

            logger.info(f"Baseline score: {baseline_score}")

            # Create heatmap
            heatmap = np.zeros((224, 224))

            # Occlude different regions
            num_patches = 224 // patch_size
            logger.info(f"Testing {num_patches}x{num_patches} patches")

            for y in range(0, 224, patch_size):
                for x in range(0, 224, patch_size):
                    try:
                        # Create occluded image
                        occluded_image = image_resized.copy()
                        y_end = min(y + patch_size, 224)
                        x_end = min(x + patch_size, 224)
                        occluded_image[y:y_end, x:x_end] = 128  # Gray patch

                        # Get score with occlusion
                        occluded_pil = Image.fromarray(occluded_image)
                        inputs_occluded = self.clip_processor(
                            text=[question_text],
                            images=occluded_pil,
                            return_tensors="pt",
                            padding=True,
                            truncation=True,
                            max_length=77  # CLIP's maximum token length
                        ).to(self.device)

                        with torch.no_grad():
                            occluded_output = self.clip_model(**inputs_occluded)
                            occluded_score = occluded_output.logits_per_image[0, 0].cpu().item()

                        # Importance = baseline - occluded (higher drop = more important)
                        importance = baseline_score - occluded_score
                        heatmap[y:y_end, x:x_end] = importance

                    except Exception as e:
                        logger.warning(f"Occlusion patch ({x},{y}) failed: {e}")
                        continue

            # Normalize heatmap
            heatmap = np.maximum(heatmap, 0)  # Keep only positive values
            if heatmap.max() > 0:
                heatmap = heatmap / heatmap.max()

            logger.info(f"Generated occlusion heatmap with shape {heatmap.shape}")
            return heatmap

        except Exception as e:
            logger.error(f"Occlusion method failed: {e}")
            return self.create_center_fallback_heatmap()

    def create_center_fallback_heatmap(self):
        """Create a center-focused fallback heatmap"""
        logger.info("Creating fallback center-focused heatmap")

        heatmap = np.zeros((224, 224))
        center_y, center_x = 112, 112

        for y in range(224):
            for x in range(224):
                distance = np.sqrt((y - center_y)**2 + (x - center_x)**2)
                heatmap[y, x] = max(0, 1 - distance / 112)

        return heatmap

    def visualize_explanation(self, image, heatmap, title="VQA Explanation", save_path=None):
        """Visualize heatmap overlay on original image"""
        try:
            # Prepare original image
            if isinstance(image, Image.Image):
                image_np = np.array(image)
            else:
                image_np = image

            # Resize image to match heatmap
            image_resized = cv2.resize(image_np, (heatmap.shape[1], heatmap.shape[0]))
            image_resized = image_resized.astype(np.float32) / 255.0

            # Create visualization
            plt.figure(figsize=(15, 5))

            # Original image
            plt.subplot(1, 3, 1)
            plt.imshow(image_resized)
            plt.title("Original Image")
            plt.axis('off')

            # Heatmap
            plt.subplot(1, 3, 2)
            plt.imshow(heatmap, cmap='hot', interpolation='bilinear')
            plt.title("Attention Heatmap")
            plt.axis('off')
            plt.colorbar()

            # Overlay
            plt.subplot(1, 3, 3)
            plt.imshow(image_resized)
            plt.imshow(heatmap, cmap='hot', alpha=0.6, interpolation='bilinear')
            plt.title(title)
            plt.axis('off')

            plt.tight_layout()

            if save_path:
                plt.savefig(save_path, dpi=300, bbox_inches='tight')
                logger.info(f"Visualization saved to {save_path}")

            plt.close()  # Close to prevent display in headless environment

            return image_resized

        except Exception as e:
            logger.error(f"Visualization failed: {e}")
            return None


class VietnameseExplanationGenerator:
    """Generate Vietnamese explanations for VQA results"""

    def __init__(self, cultural_kb):
        self.cultural_kb = cultural_kb

        # Vietnamese explanation templates
        self.templates = {
            'food': "Trong ảnh có {object}, đây là {description}. {cultural_significance}",
            'clothing': "Trang phục {object} trong ảnh thể hiện {cultural_significance}",
            'architecture': "Kiến trúc {object} mang đặc trưng {description}",
            'activity': "Hoạt động {object} có ý nghĩa {cultural_significance}",
            'general': "Đối tượng {object} trong văn hóa Việt Nam {description}"
        }

    def generate_explanation(self, question, answer, cultural_objects, heatmap=None):
        """Generate Vietnamese cultural explanation"""
        try:
            explanations = []

            # Base explanation
            base_explanation = f"Câu trả lời '{answer}' được đưa ra dựa trên phân tích hình ảnh."
            explanations.append(base_explanation)

            # Cultural explanations
            for obj in cultural_objects:
                if obj in self.cultural_kb['objects']:
                    obj_data = self.cultural_kb['objects'][obj]
                    category = obj_data.get('category', 'general')
                    template = self.templates.get(category, self.templates['general'])

                    cultural_exp = template.format(
                        object=obj,
                        description=obj_data.get('description', ''),
                        cultural_significance=obj_data.get('cultural_significance', '')
                    )
                    explanations.append(cultural_exp)

            # Visual attention explanation
            if heatmap is not None:
                attention_exp = self.generate_attention_explanation(heatmap)
                explanations.append(attention_exp)

            return " ".join(explanations)

        except Exception as e:
            logger.warning(f"Explanation generation failed: {e}")
            return f"Phân tích hình ảnh cho câu hỏi: {question}"

    def generate_attention_explanation(self, heatmap):
        """Generate explanation about visual attention"""
        try:
            # Calculate attention statistics
            max_attention = np.max(heatmap)
            mean_attention = np.mean(heatmap)

            if max_attention > 0.8:
                return "Mô hình tập trung cao độ vào một vùng cụ thể trong ảnh."
            elif mean_attention > 0.5:
                return "Mô hình phân tán sự chú ý trên nhiều vùng khác nhau."
            else:
                return "Mô hình có sự chú ý tương đối đều trên toàn bộ ảnh."

        except Exception as e:
            logger.warning(f"Attention explanation failed: {e}")
            return "Phân tích sự chú ý của mô hình."
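As a usage note: a minimal sketch of driving the explainer above with a stock CLIP checkpoint; the checkpoint name, image path, and question are illustrative assumptions, not fixed by this commit.

from PIL import Image
from transformers import CLIPModel, CLIPProcessor
from core.post_hoc_explainer import PostHocExplainer

# Any HuggingFace CLIP checkpoint that exposes attentions should work;
# this particular one is an assumption.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

explainer = PostHocExplainer(model, processor, device="cpu")
image = Image.open("pho.jpg").convert("RGB")  # hypothetical image file

# The explainer cascades on failure: attention_rollout -> gradient_based
# -> occlusion -> center-focused fallback.
heatmap = explainer.generate_heatmap(image, "Đây là món ăn gì?", method="occlusion")
explainer.visualize_explanation(image, heatmap, save_path="explanation.png")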
core/viet_meagent.py
ADDED
@@ -0,0 +1,964 @@
import torch
import torch.nn as nn
import json
import cv2
import numpy as np
from PIL import Image
import google.generativeai as genai
from typing import Dict, List, Tuple, Optional
import logging
from transformers import CLIPProcessor, CLIPModel
import easyocr
from sentence_transformers import SentenceTransformer
import faiss
import os

logger = logging.getLogger(__name__)

+
class VietMEAgent:
|
| 19 |
+
"""
|
| 20 |
+
VietMEAgent: Culturally-Aware Few-Shot Multimodal Explanation
|
| 21 |
+
for Vietnamese Visual Question Answering - FIXED CULTURAL DETECTION
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(self, config_path: str = "configs/vietmeagent_config.json"):
|
| 25 |
+
self.config = self.load_config(config_path)
|
| 26 |
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 27 |
+
|
| 28 |
+
# Initialize components
|
| 29 |
+
self.setup_models()
|
| 30 |
+
self.load_cultural_knowledge()
|
| 31 |
+
self.setup_few_shot_examples()
|
| 32 |
+
|
| 33 |
+
logger.info(f"VietMEAgent initialized on {self.device}")
|
| 34 |
+
|
    def load_config(self, config_path: str) -> Dict:
        """Load VietMEAgent configuration"""
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)

            # Flatten nested config for backward compatibility
            flat_config = {}

            # Promote model_config keys to the top level
            if 'model_config' in config:
                flat_config.update(config['model_config'])

            # Copy the remaining sections as-is
            for section, values in config.items():
                if section != 'model_config':
                    flat_config[section] = values

            # Environment variables override file values when present
            if os.getenv('GEMINI_API_KEY'):
                flat_config['gemini_api_key'] = os.getenv('GEMINI_API_KEY')

            if os.getenv('CULTURAL_THRESHOLD'):
                flat_config['cultural_threshold'] = float(os.getenv('CULTURAL_THRESHOLD'))

            if os.getenv('MAX_FEW_SHOT_EXAMPLES'):
                flat_config['max_few_shot_examples'] = int(os.getenv('MAX_FEW_SHOT_EXAMPLES'))

            return flat_config

        except FileNotFoundError:
            # Default config if the file is missing; secrets come from the
            # environment only (never hard-code API keys in source)
            default_config = {
                "gemini_api_key": os.getenv('GEMINI_API_KEY', ""),
                "max_few_shot_examples": int(os.getenv('MAX_FEW_SHOT_EXAMPLES', 16)),
                "cultural_threshold": float(os.getenv('CULTURAL_THRESHOLD', 0.15)),
                "explanation_max_length": 200,
                "heatmap_resolution": (224, 224),
                "paths": {
                    "cultural_kb": os.getenv('CULTURAL_KB_PATH', "data/cultural_kb/vietnamese_cultural_knowledge.json"),
                    "vqa_dataset": os.getenv('VQA_DATASET_PATH', "data/annotations/vietnamese_vqa_dataset.json"),
                    "output_dir": os.getenv('OUTPUT_DIR', "results")
                }
            }
            return default_config

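    # Illustrative sketch of the config file consumed above. The key names
    # mirror what load_config actually reads; any fields beyond those are
    # assumptions about the project's config layout:
    #
    # {
    #     "model_config": {
    #         "gemini_api_key": "",
    #         "max_few_shot_examples": 16,
    #         "cultural_threshold": 0.15
    #     },
    #     "paths": {
    #         "cultural_kb": "data/cultural_kb/vietnamese_cultural_knowledge.json",
    #         "vqa_dataset": "data/annotations/vietnamese_vqa_dataset.json",
    #         "output_dir": "results"
    #     }
    # }
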
    def setup_models(self):
        """Initialize all required models"""
        logger.info("Setting up models...")

        # 1. Gemini for LLM reasoning
        genai.configure(api_key=self.config["gemini_api_key"])
        self.llm_model = genai.GenerativeModel('gemini-1.5-flash')

        # 2. CLIP for vision-language understanding
        logger.info("Loading CLIP model...")
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)

        # 3. Vietnamese OCR
        logger.info("Setting up Vietnamese OCR...")
        self.ocr_reader = easyocr.Reader(['vi', 'en'], gpu=torch.cuda.is_available())

        # 4. Sentence encoder for cultural similarity
        logger.info("Loading sentence encoder...")
        self.sentence_encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

        # 5. Cultural object detector (CLIP-based)
        logger.info("Setting up cultural object detector...")
        self.cultural_detector = CulturalObjectDetector(
            self.clip_model, self.clip_processor, self.device
        )

        logger.info("All models loaded successfully!")

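    # Dependency note (assumed pip package names, not pinned versions):
    # torch, transformers, easyocr, sentence-transformers, faiss-cpu (or
    # faiss-gpu), opencv-python, google-generativeai, and pillow must all be
    # installed for setup_models() to succeed; the first call also downloads
    # the CLIP, OCR, and sentence-encoder weights.
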
    def load_cultural_knowledge(self):
        """Load Vietnamese cultural knowledge base"""
        kb_path = self.config["paths"]["cultural_kb"]
        with open(kb_path, 'r', encoding='utf-8') as f:
            self.cultural_kb = json.load(f)

        # Create cultural embeddings for fast retrieval
        self.create_cultural_embeddings()
        logger.info(f"Cultural KB loaded with {len(self.cultural_kb['objects'])} objects")

    def create_cultural_embeddings(self):
        """Create embeddings for cultural objects for fast similarity search"""
        cultural_texts = []
        self.cultural_objects = []

        for obj_name, obj_data in self.cultural_kb['objects'].items():
            text = f"{obj_name} {obj_data['description']} {obj_data['cultural_significance']}"
            cultural_texts.append(text)
            self.cultural_objects.append(obj_name)

        # Create embeddings; L2-normalize so the inner-product index below
        # behaves as cosine similarity and the threshold stays in [-1, 1]
        embeddings = self.sentence_encoder.encode(cultural_texts).astype('float32')
        faiss.normalize_L2(embeddings)

        # Build FAISS index for fast retrieval
        self.cultural_index = faiss.IndexFlatIP(embeddings.shape[1])
        self.cultural_index.add(embeddings)

        logger.info("Cultural embeddings created")

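    # Sketch of the KB entry shape that create_cultural_embeddings assumes
    # (the field values here are invented placeholders; only the two field
    # names are actually read by the code above):
    #
    # "áo dài": {
    #     "description": "Vietnamese traditional long dress ...",
    #     "cultural_significance": "National symbol worn at Tet, weddings ..."
    # }
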
+
def setup_few_shot_examples(self):
|
| 142 |
+
"""Load few-shot examples from VQA dataset"""
|
| 143 |
+
vqa_path = self.config["paths"]["vqa_dataset"]
|
| 144 |
+
with open(vqa_path, 'r', encoding='utf-8') as f:
|
| 145 |
+
vqa_data = json.load(f)
|
| 146 |
+
|
| 147 |
+
# Select diverse examples across categories
|
| 148 |
+
self.few_shot_examples = self.select_diverse_examples(
|
| 149 |
+
vqa_data, k=self.config["max_few_shot_examples"]
|
| 150 |
+
)
|
| 151 |
+
logger.info(f"Selected {len(self.few_shot_examples)} few-shot examples")
|
| 152 |
+
|
    def select_diverse_examples(self, vqa_data: List[Dict], k: int = 16) -> List[Dict]:
        """Select diverse examples across categories for few-shot learning"""
        examples_by_category = {}

        for item in vqa_data:
            category = item.get('category', 'unknown')
            examples_by_category.setdefault(category, []).append(item)

        # Guard against an empty dataset (avoids division by zero below)
        if not examples_by_category:
            return []

        # Spread the budget of k examples evenly across categories
        selected_examples = []
        examples_per_category = max(1, k // len(examples_by_category))

        for category, examples in examples_by_category.items():
            # Sort by quality (number of questions) and keep the best
            examples.sort(key=lambda x: len(x.get('questions', [])), reverse=True)
            selected_examples.extend(examples[:examples_per_category])

        return selected_examples[:k]

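    # Worked example of the budget arithmetic above: with k=16 and 4
    # categories, each category contributes up to max(1, 16 // 4) = 4
    # examples; the final [:k] slice trims any overflow when there are more
    # categories than k, since each still contributes its minimum of one.
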
    def process_image(self, image_path: str) -> Dict:
        """Process image through the complete VietMEAgent pipeline"""
        logger.info(f"Processing image: {image_path}")

        # Load image
        if isinstance(image_path, str):
            image = Image.open(image_path).convert('RGB')
        else:
            # Handle numpy array input (assumed float values in [0, 1])
            image = Image.fromarray((image_path * 255).astype(np.uint8)).convert('RGB')

        # 1. Extract Vietnamese text
        vietnamese_text = self.extract_vietnamese_text(image)

        # 2. Detect cultural objects
        cultural_objects = self.cultural_detector.detect_objects(image)
        logger.info(f"Detected cultural objects: {cultural_objects}")

        # 3. Retrieve cultural context
        cultural_context = self.retrieve_cultural_context(cultural_objects + vietnamese_text)

        # 4. Assemble the analysis result
        result = {
            "image_path": image_path if isinstance(image_path, str) else "processed_array",
            "vietnamese_text": vietnamese_text,
            "cultural_objects": cultural_objects,
            "cultural_context": cultural_context,
            "processed_successfully": True
        }

        return result

    def extract_vietnamese_text(self, image: Image.Image) -> List[str]:
        """Extract Vietnamese text from image using OCR"""
        try:
            # Convert PIL to cv2 format (BGR channel order)
            img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

            # Run OCR
            results = self.ocr_reader.readtext(img_cv)

            # Keep confident detections only
            vietnamese_texts = []
            for (bbox, text, confidence) in results:
                if confidence > 0.5:  # Filter low-confidence detections
                    vietnamese_texts.append(text)

            return vietnamese_texts

        except Exception as e:
            logger.warning(f"OCR extraction failed: {e}")
            return []

    def retrieve_cultural_context(self, detected_items: List[str]) -> Dict:
        """Retrieve cultural context for detected items"""
        if not detected_items:
            return {}

        # Create a query from the detected items; normalize the embedding to
        # match the normalized index, so scores are cosine similarities
        query_text = " ".join(detected_items)
        query_embedding = self.sentence_encoder.encode([query_text]).astype('float32')
        faiss.normalize_L2(query_embedding)

        # Search the cultural knowledge base
        k = min(5, len(self.cultural_objects))
        scores, indices = self.cultural_index.search(query_embedding, k)

        # Retrieve relevant cultural information above the threshold
        cultural_context = {}
        for score, idx in zip(scores[0], indices[0]):
            if score > self.config["cultural_threshold"]:
                obj_name = self.cultural_objects[idx]
                cultural_context[obj_name] = self.cultural_kb['objects'][obj_name]

        return cultural_context

    def generate_vietnamese_vqa(self, image_path: str, question: str = None) -> Dict:
        """Generate Vietnamese VQA with cultural explanation"""
        logger.info(f"Generating VQA for: {image_path}")

        # Process image
        image_analysis = self.process_image(image_path)

        # Load image for Gemini
        if isinstance(image_path, str):
            image = Image.open(image_path)
        else:
            image = Image.fromarray((image_path * 255).astype(np.uint8)).convert('RGB')

        # Create culturally-aware prompt
        prompt = self.create_cultural_prompt(image_analysis, question)

        try:
            # Generate with Gemini
            response = self.llm_model.generate_content([prompt, image])

            # Parse response
            vqa_result = self.parse_vqa_response(response.text)

            # Add metadata
            vqa_result.update({
                "image_analysis": image_analysis,
                "cultural_awareness": True,
                "processing_success": True
            })

            return vqa_result

        except Exception as e:
            logger.error(f"VQA generation failed: {e}")
            return {"error": str(e), "processing_success": False}

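    # Shape of a successful return value (illustrative; the question text is
    # produced by Gemini and will vary between runs):
    #
    # {
    #     "questions": [{"question": "...", "answer": "...",
    #                    "explanation": "...", "cultural_objects": [...],
    #                    "confidence": 0.9}],
    #     "image_analysis": {...},       # output of process_image
    #     "cultural_awareness": True,
    #     "processing_success": True
    # }
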
    def create_cultural_prompt(self, image_analysis: Dict, question: str = None) -> str:
        """Create a culturally-aware prompt (in Vietnamese) for VQA generation"""

        # Opening (Vietnamese): "You are an expert on Vietnamese culture.
        # Analyze this image and create question-answer pairs in Vietnamese,"
        # followed by the OCR text and detected cultural objects.
        prompt = f"""
Bạn là chuyên gia về văn hóa Việt Nam. Hãy phân tích hình ảnh này và tạo câu hỏi-trả lời bằng tiếng Việt.

THÔNG TIN PHÂN TÍCH:
- Text trong ảnh: {', '.join(image_analysis.get('vietnamese_text', []))}
- Đối tượng văn hóa: {', '.join(image_analysis.get('cultural_objects', []))}

BỐI CẢNH VĂN HÓA:
"""

        # Append the retrieved cultural context, one line per object
        for obj_name, obj_data in image_analysis.get('cultural_context', {}).items():
            prompt += f"- {obj_name}: {obj_data.get('cultural_significance', '')}\n"

        # Requirements (Vietnamese): 1) create 2-3 questions about Vietnamese
        # culture if no specific question is given, 2) answers must be accurate
        # and culturally explained, 3) explanations must cover meaning, origin,
        # and usage.
        prompt += """

YÊU CẦU:
1. Tạo 2-3 câu hỏi về văn hóa Việt Nam (nếu không có câu hỏi cụ thể)
2. Câu trả lời phải chính xác và có giải thích văn hóa
3. Giải thích phải bao gồm ý nghĩa, nguồn gốc, cách sử dụng

"""

        if question:
            prompt += f"CÂU HỎI CỤ THỂ: {question}\n"

        # Ask for a strict JSON response so parse_vqa_response can extract it
        prompt += """
FORMAT JSON:
{
    "questions": [
        {
            "question": "Câu hỏi",
            "answer": "Câu trả lời",
            "explanation": "Giải thích có bối cảnh văn hóa",
            "cultural_objects": ["đối tượng 1", "đối tượng 2"],
            "confidence": 0.9
        }
    ]
}
"""

        return prompt

    def parse_vqa_response(self, response_text: str) -> Dict:
        """Parse VQA response from Gemini"""
        try:
            # Try to extract the outermost JSON object
            start_idx = response_text.find('{')
            end_idx = response_text.rfind('}') + 1

            # rfind returns -1 when no '}' exists, so end_idx must exceed
            # start_idx for a valid span (an `end_idx != -1` check could
            # never fail here)
            if start_idx != -1 and end_idx > start_idx:
                json_str = response_text[start_idx:end_idx]
                return json.loads(json_str)
            else:
                # Fallback parsing
                return self.fallback_parse_response(response_text)

        except json.JSONDecodeError:
            return self.fallback_parse_response(response_text)

    def fallback_parse_response(self, text: str) -> Dict:
        """Fallback parser for non-JSON responses"""
        lines = text.split('\n')
        result = {"questions": []}

        current_q = {"question": "", "answer": "", "explanation": "", "cultural_objects": []}

        for line in lines:
            line = line.strip()
            if ('question' in line.lower() or 'câu hỏi' in line.lower()) and ':' in line:
                current_q["question"] = line.split(':', 1)[1].strip()
            elif ('answer' in line.lower() or 'trả lời' in line.lower()) and ':' in line:
                current_q["answer"] = line.split(':', 1)[1].strip()
            elif ('explanation' in line.lower() or 'giải thích' in line.lower()) and ':' in line:
                current_q["explanation"] = line.split(':', 1)[1].strip()

            # Once all required fields are filled, emit the question and reset
            if all([current_q["question"], current_q["answer"], current_q["explanation"]]):
                current_q["confidence"] = 0.7  # Default confidence for fallback
                result["questions"].append(current_q.copy())
                current_q = {"question": "", "answer": "", "explanation": "", "cultural_objects": []}

        return result

    def save_results(self, results: List[Dict], output_path: str):
        """Save VietMEAgent results as UTF-8 JSON"""
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        logger.info(f"Results saved to {output_path}")

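# Minimal end-to-end usage sketch (the image path and question below are
# hypothetical, and the config, KB, and dataset files must exist for this
# to run):
#
#     agent = VietMEAgent()
#     result = agent.generate_vietnamese_vqa(
#         "samples/pho.jpg",
#         question="Món ăn trong ảnh là gì?"  # "What dish is in the picture?"
#     )
#     agent.save_results([result], "results/pho_vqa.json")
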
class CulturalObjectDetector:
    """Detect Vietnamese cultural objects with CLIP zero-shot classification"""

    def __init__(self, clip_model, clip_processor, device):
        self.clip_model = clip_model
        self.clip_processor = clip_processor
        self.device = device

        # Load the bilingual cultural object vocabulary
        self.cultural_vocabulary = self.load_cultural_vocabulary()
        logger.info(f"Cultural detector initialized with {len(self.cultural_vocabulary)} objects")

    def load_cultural_vocabulary(self) -> List[str]:
        """Load the comprehensive vocabulary of Vietnamese cultural objects"""
        # English-Vietnamese pairs based on crawl_summary.json (12 categories)
        vocabulary_pairs = [
            # ===== 1. ẨM THỰC (FOOD) =====
            ("vietnamese pho soup", "phở"),
            ("vietnamese banh mi sandwich", "bánh mì"),
            ("vietnamese spring rolls", "gỏi cuốn"),
            ("vietnamese pancake", "bánh xèo"),
            ("sticky rice", "xôi"),
            ("vietnamese coffee", "cà phê"),
            ("vietnamese tea", "chè"),
            ("rice paper", "bánh tráng"),
            ("fish sauce", "nước mắm"),
            ("hue beef noodle soup", "bún bò Huế"),
            ("vietnamese sticky rice cake", "bánh chưng"),
            ("broken rice", "cơm tấm"),
            ("cao lau noodles", "cao lầu"),
            ("mi quang noodles", "mì Quảng"),
            ("hanoi grilled pork noodles", "bún chả"),
            ("steamed rice rolls", "bánh cuốn"),
            ("cha ca fish", "chả cá"),
            ("grilled pork skewers", "nem nướng"),
            ("vietnamese steamed buns", "bánh bao"),
            ("red sticky rice", "xôi gấc"),
            ("vietnamese flan", "bánh flan"),
            ("grilled rice paper", "bánh tráng nướng"),
            ("vietnamese filter coffee", "cà phê phin"),
            ("phan thiet pancakes", "bánh căn"),
            ("grilled pork vermicelli", "bún thịt nướng"),
            ("mini pancakes", "bánh khọt"),
            ("pork offal porridge", "cháo lòng"),
            ("tapioca dumplings", "bánh bột lọc"),
            ("small dumplings", "bánh ít"),
            ("cylindrical sticky rice cake", "bánh tét"),
            ("pounded rice cake", "bánh chày"),
            ("hue imperial rice", "cơm âm phủ"),
            ("fermented shrimp paste", "mắm ruốc"),
            ("phu quoc fish sauce", "nước mắm Phú Quốc"),
            ("chili sauce", "tương ớt"),
            ("mung bean cake", "bánh đậu xanh"),
            ("durian cake", "bánh pía"),
            ("ben tre coconut candy", "kẹo dừa Bến Tre"),
            ("tet jam", "mứt Tết"),
            ("molded cake", "bánh in"),
            ("pyramid dumpling", "bánh giò"),
            ("black sticky rice cake", "bánh gai"),
            ("fried doughnut", "bánh rán"),
            ("hung yen cinnamon sausage", "chả quế Hưng Yên"),
            ("fermented pork roll", "nem chua"),
            ("dried shrimp", "tôm khô"),
            ("shrimp paste", "mắm tôm"),
            ("fish porridge", "cháo cá"),
            ("sour soup", "canh chua"),
            ("grilled chicken", "gà nướng"),
            ("roasted duck", "vịt quay"),
            ("vietnamese ham", "chả lụa"),
            ("pork head cheese", "giò thủ"),
            ("special sticky rice cake", "bánh chưng gù"),
            ("rice cake", "bánh dày"),

            # ===== 2. KIẾN TRÚC (ARCHITECTURE) =====
            ("vietnamese temple", "chùa"),
            ("vietnamese pagoda", "chùa"),
            ("village communal house", "đình làng"),
            ("stilt house", "nhà sàn"),
            ("hanoi flag tower", "cột cờ Hà Nội"),
            ("one pillar pagoda", "chùa Một Cột"),
            ("tran quoc pagoda", "chùa Trấn Quốc"),
            ("temple of literature", "Văn Miếu"),
            ("ho chi minh mausoleum", "lăng Hồ Chí Minh"),
            ("dragon house", "nhà rồng"),
            ("ba den temple", "chùa Bà Đen"),
            ("ngoc son temple", "đền Ngọc Sơn"),
            ("hanoi old quarter", "phố cổ Hà Nội"),
            ("hue imperial architecture", "kiến trúc Huế"),
            ("an dinh palace", "cung An Định"),
            ("independence palace", "dinh Độc Lập"),
            ("dong xuan market", "chợ Đồng Xuân"),
            ("japanese covered bridge", "cầu Nhật Bản"),
            ("hoi an ancient house", "nhà cổ Hội An"),
            ("terraced fields architecture", "ruộng bậc thang"),
            ("notre dame cathedral", "nhà thờ Đức Bà"),
            ("saigon post office", "bưu điện Sài Gòn"),
            ("hanoi opera house", "nhà hát Lớn Hà Nội"),
            ("long bien bridge", "cầu Long Biên"),
            ("thang long imperial citadel", "hoàng thành Thăng Long"),
            ("hue imperial city", "kinh thành Huế"),
            ("khai dinh tomb", "lăng Khải Định"),
            ("minh mang tomb", "lăng Minh Mạng"),
            ("bai dinh pagoda", "chùa Bái Đính"),
            ("tam chuc pagoda", "chùa Tam Chúc"),
            ("hung kings temple", "đền Hùng"),
            ("bach ma temple", "đền Bạch Mã"),
            ("hanoi citadel gate", "cổng thành Hà Nội"),
            ("turtle tower", "tháp Rùa"),
            ("the huc bridge", "cầu Thê Húc"),
            ("ho chi minh house", "nhà Bác Hồ"),
            ("presidential palace", "phủ Chủ tịch"),
            ("ba dinh square", "quảng trường Ba Đình"),
            ("tu duc tomb", "lăng Tự Đức"),
            ("jade emperor pagoda", "chùa Ngọc Hoàng"),
            ("cao dai temple", "chùa Cao Đài"),
            ("hmong stilt house", "nhà sàn H'Mông"),
            ("ede longhouse", "nhà dài Ê Đê"),
            ("mekong traditional house", "nhà truyền thống miền Tây"),
            ("hue garden house", "nhà vườn Huế"),
            ("french villa", "biệt thự Pháp"),
            ("gothic architecture", "kiến trúc Gothic"),

            # ===== 3. TRANG PHỤC (CLOTHING) =====
            ("vietnamese traditional dress", "áo dài"),
            ("conical hat", "nón lá"),
            ("vietnamese traditional clothing", "trang phục truyền thống"),
            ("ethnic costume", "trang phục dân tộc"),
            ("vietnamese traditional shirt", "áo bà ba"),
            ("thai headscarf", "khăn piêu"),
            ("hmong traditional costume", "trang phục H'Mông"),
            ("hue brocade dress", "áo gấm Huế"),
            ("hue turban", "khăn đóng Huế"),
            ("wooden shoes", "giày gỗ"),
            ("four-panel dress", "áo tứ thân"),
            ("traditional bra", "yếm đào"),
            ("wedding ao dai", "áo dài cưới"),
            ("chin strap hat", "nón quai thao"),
            ("brocade fabric", "thổ cẩm"),
            ("silk scarf", "khăn lụa"),
            ("mens ao dai", "áo dài nam"),
            ("childrens ao dai", "áo dài trẻ em"),
            ("student ao dai", "áo dài học sinh"),
            ("modern ao dai", "áo dài cách tân"),
            ("hue conical hat", "nón lá Huế"),
            ("poem hat", "nón bài thơ"),
            ("southern checkered scarf", "khăn rằn Nam Bộ"),
            ("traditional halter top", "áo yếm truyền thống"),
            ("tay traditional costume", "trang phục Tày"),
            ("nung traditional costume", "trang phục Nùng"),
            ("muong traditional costume", "trang phục Mường"),
            ("khmer traditional costume", "trang phục Khmer"),
            ("cham traditional costume", "trang phục Chăm"),
            ("cham sarong", "sarong Chăm"),
            ("cham turban", "turban Chăm"),
            ("ede traditional costume", "trang phục Ê Đê"),
            ("co tu traditional costume", "trang phục Cơ Tu"),
            ("dao traditional costume", "trang phục Dao"),
            ("giay traditional costume", "trang phục Giáy"),
            ("la chi traditional costume", "trang phục La Chí"),
            ("brocade skirt", "váy thổ cẩm"),
            ("brocade headscarf", "khăn thổ cẩm"),
            ("brocade bag", "túi thổ cẩm"),
            ("silver bracelet", "vòng tay bạc"),
            ("silver necklace", "dây chuyền bạc"),
            ("ethnic earrings", "khuyên tai dân tộc"),
            ("hmong collar", "vòng cổ H'Mông"),
            ("brocade belt", "thắt lưng thổ cẩm"),

            # ===== 4. LỄ HỘI (FESTIVALS) =====
            ("vietnamese new year", "Tết Nguyên Đán"),
            ("cherry blossom festival", "lễ hội hoa anh đào"),
            ("mid autumn festival", "Trung thu"),
            ("hung kings festival", "lễ hội đền Hùng"),
            ("hue festival", "festival Huế"),
            ("perfume pagoda festival", "lễ hội chùa Hương"),
            ("kate festival", "Kate festival"),
            ("whale worship festival", "lễ hội cầu ngư"),
            ("buffalo fighting festival", "lễ hội chọi trâu"),
            ("sticky rice cake festival", "lễ hội bánh chưng"),
            ("giong festival", "Gióng festival"),
            ("village festival", "lễ hội làng"),
            ("vietnamese wedding", "đám cưới Việt Nam"),
            ("water festival", "lễ hội nước"),
            ("harvest festival", "lễ hội harvest"),
            ("vu lan festival", "Vu Lan festival"),
            ("boat racing festival", "lễ hội đua thuyền"),
            ("rice harvest festival", "lễ hội hái lúa"),
            ("thanksgiving festival", "lễ hội cúng ơn"),
            ("ok om bok festival", "lễ hội Óc Om Bóc"),
            ("don ta festival", "lễ hội Dôn Ta"),
            ("khmer new year", "lễ hội Chaul Chnam Thmey"),
            ("roong pooc festival", "lễ hội Roóng Poọc"),
            ("nang hai festival", "lễ hội Nàng Hai"),
            ("lion dance festival", "lễ hội múa lân"),
            ("fireworks festival", "lễ hội pháo hoa"),
            ("ban flower festival", "lễ hội hoa ban"),
            ("coffee festival", "lễ hội café"),
            ("con throwing festival", "lễ hội ném còn"),
            ("love festival", "lễ hội tình yêu"),
            ("vietnamese valentine", "Valentine Việt Nam"),
            ("first full moon", "rằm tháng Giêng"),
            ("cold food festival", "tết Hàn thực"),
            ("doan ngo festival", "tết Đoan ngọ"),
            ("seventh month full moon", "rằm tháng Bảy"),
            ("teachers day", "lễ 20/11"),
            ("womens day", "lễ 8/3"),
            ("hung kings commemoration", "lễ giỗ tổ Hùng Vương"),
            ("national day", "lễ Quốc khánh"),

            # ===== 5. THỦ CÔNG MỸ NGHỆ (HANDICRAFTS) =====
            ("bat trang ceramics", "gốm sứ Bát Tràng"),
            ("dong ho paintings", "tranh Đông Hồ"),
            ("vietnamese embroidery", "thêu Việt Nam"),
            ("weaving", "đan lát"),
            ("bamboo weaving", "mây tre đan"),
            ("vietnamese lacquer", "sơn mài Việt Nam"),
            ("wood carving", "điêu khắc gỗ"),
            ("hue ceramics", "gốm Huế"),
            ("silk painting vietnam", "tranh lụa"),
            ("bronze casting", "đúc đồng"),
            ("stone carving", "chạm khắc"),
            ("brocade weaving", "thổ cẩm dệt"),
            ("chu dau ceramics", "gốm Chu Đậu"),
            ("hue porcelain", "sứ Huế"),
            ("phu lang ceramics", "gốm Phù Lãng"),
            ("silk painting", "tranh lụa"),
            ("lacquer painting", "tranh sơn mài"),
            ("mother of pearl inlay", "khảm trai"),
            ("wood sculpture", "tượng gỗ"),
            ("stone sculpture", "tượng đá"),
            ("bronze items", "đồ đồng"),
            ("silver items", "đồ bạc"),
            ("ethnic jewelry", "trang sức dân tộc"),
            ("folk masks", "mặt nạ dân gian"),
            ("water puppets", "rối nước"),
            ("carpet weaving", "dệt thảm"),
            ("sedge mat", "chiếu cói"),
            ("handmade conical hat", "nón lá thủ công"),
            ("incense making", "làm hương"),
            ("do paper", "giấy dó"),
            ("cake mold making", "làm bánh in"),
            ("folk candy", "kẹo dân gian"),

            # ===== 6. NHẠC CỤ (MUSICAL INSTRUMENTS) =====
            ("vietnamese monochord", "đàn bầu"),
            ("vietnamese drums", "trống"),
            ("bamboo flute", "sáo trúc"),
            ("vietnamese zither", "đàn tranh"),
            ("moon lute", "đàn nguyệt"),
            ("vietnamese pipa", "đàn tỳ bà"),
            ("gourd trumpet", "kèn bầu"),
            ("bronze gong", "cồng chiêng"),
            ("two string fiddle", "đàn nhị"),
            ("pan flute", "sáo điếu"),
            ("rice drum", "trống cơm"),
            ("vietnamese lute", "đàn đáy"),
            ("vietnamese guitar", "đàn sến"),
            ("36 string zither", "đàn tam thập lục"),
            ("16 string zither", "đàn thập lục"),
            ("leaf trumpet", "kèn lá"),
            ("ethnic flute", "sáo mọi"),
            ("ceremonial drum", "trống chầu"),
            ("wooden bell", "mõ gỗ"),
            ("temple bell", "chuông chùa"),
            ("bronze cymbal", "chiêng đồng"),
            ("kni string instrument", "đàn K'ni"),
            ("trung bamboo xylophone", "đàn T'rưng"),
            ("pi flute", "sáo pí"),
            ("bronze drum", "trống đồng"),
            ("single string instrument", "đàn bầu độc huyền"),

            # ===== 7. PHONG CẢNH (LANDSCAPES) =====
            ("ha long bay", "vịnh Hạ Long"),
            ("sapa terraced fields", "ruộng bậc thang Sapa"),
            ("mekong delta", "delta sông Mekong"),
            ("hoan kiem lake", "Hồ Gươm"),
            ("west lake hanoi", "Hồ Tây Hà Nội"),
            ("phong nha cave", "Phong Nha cave"),
            ("ba be lake", "Ba Be lake"),
            ("mui ne sand dunes", "Mũi Né sand dunes"),
            ("ninh binh landscape", "Ninh Bình landscape"),
            ("tam coc", "Tam Cốc"),
            ("hoi an ancient town", "Hội An ancient town"),
            ("da lat hills", "Đà Lạt hills"),
            ("can tho floating market", "Cần Thơ floating market"),
            ("muong hoa valley", "Mường Hoa valley"),
            ("ha long bay caves", "Hạ Long Bay caves"),
            ("fansipan mountain", "núi Phan Xi Păng"),
            ("dong van plateau", "cao nguyên Đồng Văn"),
            ("cuc phuong national park", "vườn quốc gia Cúc Phương"),
            ("u minh national park", "vườn quốc gia U Minh"),
            ("phu quoc island", "đảo Phú Quốc"),
            ("cat ba island", "đảo Cát Bà"),
            ("thoi son islet", "cồn Thoi Son"),
            ("moc chau tea hills", "đồi chè Mộc Châu"),
            ("mui ne sand hills", "đồi cát Mũi Né"),
            ("quy nhon beach", "biển Quy Nhon"),
            ("nha trang beach", "biển Nha Trang"),
            ("da nang beach", "biển Đà Nẵng"),
            ("vung tau beach", "biển Vũng Tàu"),
            ("ha long beach", "biển Hạ Long"),
            ("sam son beach", "biển Sầm Sơn"),
            ("red river", "sông Hồng"),
            ("mekong river", "sông Mekong"),
            ("perfume river", "sông Hương"),
            ("thu bon river", "sông Thu Bồn"),
            ("ban gioc waterfall", "thác Ban Giốc"),
            ("can gio mangrove forest", "rừng ngập mặn Cần Giờ"),
            ("tram chim forest", "rừng Tràm Chim"),
            ("yok don national park", "vườn quốc gia Yok Đôn"),

            # ===== 8. VĂN HÓA DÂN GIAN (FOLK CULTURE) =====
            ("water puppet show", "múa rối nước"),
            ("ca tru performance", "Ca trù performance"),
            ("cheo opera", "Chèo opera"),
            ("cai luong opera", "Cải lương"),
            ("tuong classical opera", "Tuồng classical opera"),
            ("vietnamese folklore", "văn hóa dân gian"),
            ("dragon dance", "múa rồng"),
            ("lion dance", "múa lân"),
            ("traditional storytelling", "kể chuyện"),
            ("vietnamese folk songs", "hát dân ca"),
            ("quan ho singing", "quan họ singing"),
            ("hat van ritual", "hát văn ritual"),
            ("xam singing", "xẩm singing"),
            ("folk tales vietnam", "folk tales Vietnam"),
            ("thang long water puppets", "rối nước Thăng Long"),
            ("traditional dance", "múa truyền thống"),
            ("sap dance", "múa sạp"),
            ("xoang dance", "múa xoang"),
            ("shadow dance", "múa bóng rỗi"),
            ("silk dance", "múa lụa"),
            ("lullaby", "hát ru"),
            ("bac ninh quan ho", "hát quan họ Bắc Ninh"),
            ("chau van singing", "hát chầu văn"),
            ("vi giam folk song", "ví giặm"),
            ("ho khoan work song", "hò khoan"),
            ("soong co singing", "hát soong cọ"),
            ("quan ho folk song", "dân ca quan họ"),
            ("xoan singing", "hát xoan"),
            ("hue royal music", "ca Huế"),
            ("nghe tinh folk song", "hò Nghệ Tĩnh"),

            # ===== 9. GIAO THÔNG (TRANSPORTATION) =====
            ("vietnamese motorbike", "xe máy"),
            ("cyclo vietnam", "xích lô"),
            ("motorbike taxi", "xe ôm"),
            ("mekong boat", "thuyền Mekong"),
            ("vietnamese train", "tàu hỏa"),
            ("vietnamese transportation", "giao thông Việt Nam"),
            ("traditional boat vietnam", "thuyền truyền thống"),
            ("basket boat", "thúng chai"),
            ("dragon boat vietnam", "thuyền rồng"),
            ("vietnamese bus", "xe buýt"),
            ("vietnamese taxi", "taxi"),
            ("grab bike", "grab bike"),
            ("electric vehicle vietnam", "xe điện"),
            ("round boat", "thuyền thúng"),
            ("cargo boat", "ghe bầu"),
            ("kayak vietnam", "thuyền kayak"),
            ("ox cart", "xe bò"),
            ("buffalo cart", "xe trâu"),
            ("palanquin vietnam", "kiệu"),
            ("wedding palanquin", "kiệu hoa"),
            ("three wheeler", "xe lam"),
            ("ferry boat", "đò nang"),

            # ===== 10. ĐỜI SỐNG HÀNG NGÀY (DAILY LIFE) =====
            ("vietnamese market", "chợ Việt Nam"),
            ("street food vietnam", "street food Vietnam"),
            ("coffee shop vietnam", "coffee shop Vietnam"),
            ("vietnamese family", "gia đình Việt Nam"),
            ("vietnam daily life", "đời sống hàng ngày"),
            ("rice farming vietnam", "rice farming Vietnam"),
            ("fishing village vietnam", "fishing village Vietnam"),
            ("vietnamese school", "trường học Việt Nam"),
            ("traditional market", "chợ truyền thống"),
            ("tet celebration family", "Tết gia đình"),
            ("vietnamese kitchen", "nhà bếp Việt Nam"),
            ("can tho floating market", "chợ nổi Cần Thơ"),
            ("ben thanh market", "chợ Bến Thành"),
            ("countryside market", "chợ quê"),
            ("craft village vietnam", "làng nghề"),
            ("pottery village", "làng gốm"),
            ("weaving village", "làng dệt"),
            ("fishing village", "làng chài"),
            ("vietnamese farmer", "nông dân"),
            ("rice harvest", "thu hoạch lúa"),
            ("rice planting", "cấy lúa"),
            ("rice threshing", "đập lúa"),
            ("shrimp farming", "nuôi tôm"),
            ("fish farming", "nuôi cá"),
            ("buffalo herding", "chăn trâu bò"),
            ("duck herding", "chăn vịt"),
            ("family meal", "bữa cơm gia đình"),
            ("ancestor altar", "bàn thờ gia tiên"),
            ("vietnamese student", "học sinh Việt Nam"),
            ("classroom", "lớp học"),
            ("playground", "sân chơi"),
            ("vietnamese neighborhood", "khu phố"),
            ("sidewalk cafe", "quán cà phê vỉa hè"),
            ("street food stall", "quán ăn đường phố"),
            ("street vendor", "xe hàng rong"),
            ("daily work", "công việc hàng ngày"),
            ("rural life", "sinh hoạt làng quê"),
            ("city life", "đời sống thành phố"),

            # ===== 11. TRÒ CHƠI DÂN GIAN (TRADITIONAL GAMES) =====
            ("tug of war vietnam", "kéo co"),
            ("shuttlecock kicking", "đá cầu"),
            ("bamboo dancing", "nhảy sạp"),
            ("kite flying", "thả diều"),
            ("o an quan game", "ô ăn quan"),
            ("blind mans bluff", "bịt mắt bắt dê"),
            ("stick hitting game", "đánh khăng"),
            ("pot breaking game", "đập niêu"),
            ("buffalo fighting", "chọi trâu"),
            ("swing game", "đu tiên"),
            ("vietnamese traditional games", "trò chơi dân gian"),
            ("village wrestling", "hội vật làng"),
            ("traditional jump rope", "nhảy dây truyền thống"),
            ("bamboo spinning top", "đánh quay tre"),
            ("bamboo ring throwing", "thả vòng tre"),
            ("con throwing", "tung còn"),
            ("traditional wrestling", "vật truyền thống"),
            ("cockfighting vietnam", "chọi gà"),
            ("spinning top vietnam", "đánh quay"),
            ("hide and seek vietnam", "trốn tìm"),
            ("stilts walking", "đi cà kheo"),
            ("shuttlecock passing", "chơi chuyền"),
            ("badminton throwing", "ném gà bông"),
            ("marble shooting", "bắn bi"),
            ("hopscotch vietnam", "chơi lò cò"),
            ("tree climbing", "trèo cây"),
            ("river swimming", "bơi sông"),
            ("boat racing", "đua thuyền"),
            ("dragon dancing", "múa rồng"),
            ("children lion dance", "múa lân trẻ em"),
            ("drum playing", "đánh trống"),
            ("flute playing", "thổi kèn"),
            ("instrument playing", "chơi đàn"),
            ("storytelling", "kể chuyện"),
            ("poetry reciting", "đọc thơ"),

            # ===== 12. THỂ THAO TRUYỀN THỐNG (TRADITIONAL SPORTS) =====
            ("dragon boat racing vietnam", "đua thuyền rồng"),
            ("vietnamese traditional wrestling", "vật cổ truyền"),
            ("stick pushing", "đẩy gậy"),
            ("crossbow shooting", "bắn nỏ"),
            ("sepak takraw vietnam", "cầu mây"),
            ("vietnamese martial arts", "võ cổ truyền"),
            ("lion dragon competition", "lân sư rồng thi đấu"),
            ("vietnamese chess", "cờ tướng"),
            ("traditional stick fighting", "đánh gậy truyền thống"),
            ("ghe ngo boat racing", "đua ghe ngo"),
            ("bay nui ox racing", "đua bò Bảy Núi"),
            ("ha long kayak racing", "đua thuyền kayak Hạ Long"),
            ("vovinam demonstration", "vovinam biểu diễn"),
            ("vietnamese boxing", "muay Việt Nam"),
            ("binh dinh martial arts", "võ Bình Định"),
            ("tay son martial arts", "võ Tây Sơn"),
            ("traditional weapons", "kim khí"),
            ("nunchaku", "côn nhị khúc"),
            ("tai chi vietnam", "thái cực quyền"),
            ("boxing vietnam", "quyền anh"),
            ("judo vietnam", "judo"),
            ("wrestling vietnam", "đấu vật"),
            ("weightlifting vietnam", "cử tạ"),
            ("swimming vietnam", "bơi lội"),
            ("cycling racing", "đua xe đạp"),
            ("marathon vietnam", "marathon"),
            ("badminton vietnam", "cầu lông"),
            ("tennis vietnam", "tennis"),
            ("table tennis vietnam", "bóng bàn"),
            ("karate vietnam", "karatedo"),
            ("taekwondo vietnam", "taekwondo"),
            ("football vietnam", "bóng đá"),
            ("beach volleyball vietnam", "bóng chuyền bãi biển"),
            ("street basketball vietnam", "bóng rổ đường phố"),
            ("athletics sea games vietnam", "điền kinh SEA Games"),
            ("kickboxing vietnam", "kickboxing"),
            ("mma vietnam", "MMA"),
            ("gymnastics vietnam", "gymnastics"),
            ("diving vietnam", "diving"),

            # ===== GENERAL CULTURAL TERMS =====
            ("vietnamese culture", "văn hóa Việt Nam"),
            ("traditional festival", "lễ hội truyền thống"),
            ("vietnamese tradition", "truyền thống Việt Nam"),
            ("vietnamese heritage", "di sản Việt Nam"),
            ("folk culture", "văn hóa dân gian"),
            ("traditional art", "nghệ thuật truyền thống"),
            ("vietnamese customs", "phong tục Việt Nam"),
            ("cultural performance", "biểu diễn văn hóa"),
            ("ethnic minority", "dân tộc thiểu số"),
            ("cultural identity", "bản sắc văn hóa"),
        ]

        # Extract English terms for CLIP detection
        english_terms = [pair[0] for pair in vocabulary_pairs]

        # Store the English -> Vietnamese mapping for result translation
        self.en_to_vi_mapping = dict(vocabulary_pairs)

        logger.info(f"Loaded comprehensive cultural vocabulary: {len(english_terms)} items across 12 categories")
        return english_terms

    def detect_objects(self, image: Image.Image, threshold: float = 0.15) -> List[str]:
        """Detect cultural objects in an image with CLIP zero-shot scoring"""
        try:
            # Score each object under several prompt templates and ensemble
            templates = [
                "a photo of {}",
                "an image showing {}",
                "{}",
                "traditional {}",
                "vietnamese {}"
            ]

            all_text_inputs = []
            all_labels = []

            for obj in self.cultural_vocabulary:
                for template in templates:
                    all_text_inputs.append(template.format(obj))
                    all_labels.append(obj)

            # Process in batches to avoid memory issues. Raw logits are
            # collected and softmaxed once over the full prompt set: a
            # per-batch softmax would normalize each batch independently,
            # making scores depend on batch composition (the threshold may
            # need retuning against this consistent scale).
            batch_size = 50
            all_logits = []

            for i in range(0, len(all_text_inputs), batch_size):
                batch_texts = all_text_inputs[i:i + batch_size]

                inputs = self.clip_processor(
                    text=batch_texts,
                    images=image,
                    return_tensors="pt",
                    padding=True
                ).to(self.device)

                with torch.no_grad():
                    outputs = self.clip_model(**inputs)
                    all_logits.append(outputs.logits_per_image[0].cpu())

            all_probs = torch.cat(all_logits).softmax(dim=0).numpy()

            # Group probabilities by object (ensemble across templates)
            object_probs = {}
            for prob, label in zip(all_probs, all_labels):
                object_probs.setdefault(label, []).append(prob)

            # Combine average and max template scores, then filter
            detected_objects = []
            for obj, probs in object_probs.items():
                avg_prob = np.mean(probs)
                max_prob = np.max(probs)
                final_score = (avg_prob * 0.3 + max_prob * 0.7)

                if final_score > threshold:
                    # Translate back to Vietnamese for downstream prompts
                    vietnamese_name = self.en_to_vi_mapping.get(obj, obj)
                    detected_objects.append(vietnamese_name)
                    logger.debug(f"Detected {obj} -> {vietnamese_name} (score: {final_score:.3f})")

            return detected_objects

        except Exception as e:
            logger.warning(f"Object detection failed: {e}")
            return []
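
# Detector-level smoke test (illustrative; "samples/aodai.jpg" is a
# hypothetical path, and the CLIP weights loaded in setup_models() must
# already be available):
#
#     from PIL import Image
#     agent = VietMEAgent()
#     img = Image.open("samples/aodai.jpg").convert("RGB")
#     print(agent.cultural_detector.detect_objects(img, threshold=0.15))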