"""
Shared Entity Extraction Utilities for BeatDebate Agents

Consolidates entity extraction patterns that are duplicated across agents,
providing a unified approach to entity recognition and processing.
"""

import re
from typing import Dict, List, Any, Optional, Set
import structlog

logger = structlog.get_logger(__name__)


class EntityExtractionUtils:
    """
    Shared utilities for entity extraction across all agents.
    
    Consolidates:
    - Artist name extraction
    - Genre identification
    - Track/album extraction
    - Mood/context detection
    - Entity validation and cleaning
    """
    
    def __init__(self):
        """Initialize entity extraction utilities."""
        self.logger = logger.bind(component="EntityExtractionUtils")
        
        # Common genre patterns
        self.genre_patterns = {
            'rock': ['rock', 'alternative', 'indie rock', 'punk', 'grunge', 'metal'],
            'electronic': ['electronic', 'edm', 'techno', 'house', 'ambient', 'synth'],
            'pop': ['pop', 'mainstream', 'chart', 'commercial'],
            'hip hop': ['hip hop', 'rap', 'hip-hop', 'hiphop'],
            'jazz': ['jazz', 'bebop', 'swing', 'fusion'],
            'classical': ['classical', 'orchestra', 'symphony', 'baroque'],
            'folk': ['folk', 'acoustic', 'singer-songwriter'],
            'r&b': ['r&b', 'soul', 'funk', 'motown'],
            'country': ['country', 'bluegrass', 'americana'],
            'reggae': ['reggae', 'ska', 'dub']
        }
        
        # Common mood indicators
        self.mood_patterns = {
            'energetic': ['energetic', 'upbeat', 'high energy', 'pumped', 'intense'],
            'calm': ['calm', 'peaceful', 'relaxing', 'chill', 'mellow'],
            'melancholic': ['sad', 'melancholic', 'depressing', 'somber', 'moody'],
            'happy': ['happy', 'joyful', 'uplifting', 'cheerful', 'positive'],
            'aggressive': ['aggressive', 'angry', 'intense', 'heavy', 'brutal'],
            'romantic': ['romantic', 'love', 'intimate', 'sensual'],
            'nostalgic': ['nostalgic', 'vintage', 'retro', 'classic']
        }
        
        # Context/activity patterns
        self.context_patterns = {
            'work': ['work', 'coding', 'study', 'focus', 'concentration', 'productivity'],
            'exercise': ['workout', 'gym', 'running', 'exercise', 'fitness', 'training'],
            'party': ['party', 'dance', 'club', 'celebration', 'social'],
            'relax': ['relax', 'chill', 'unwind', 'rest', 'leisure'],
            'driving': ['driving', 'road trip', 'car', 'travel'],
            'sleep': ['sleep', 'bedtime', 'night', 'lullaby']
        }
        
        # Common artist indicators
        self.artist_indicators = [
            'by', 'from', 'artist', 'band', 'singer', 'musician',
            'like', 'similar to', 'sounds like', 'reminds me of'
        ]
    
    def extract_artists_from_text(self, text: str) -> Dict[str, List[str]]:
        """
        Extract artist names from text using pattern matching.
        
        Args:
            text: Input text to extract artists from
            
        Returns:
            Dictionary with 'primary' and 'similar_to' artist lists
        """
        text_lower = text.lower()
        artists = {'primary': [], 'similar_to': []}
        
        # Pattern 1: "by [artist]" or "from [artist]"
        by_pattern = r'\b(?:by|from)\s+([A-Za-z0-9\s&\-\'\.]+?)(?:\s*$|,|\?|!|\s+(?:music|songs?|tracks?|bands?|artists?))'
        by_matches = re.findall(by_pattern, text, re.IGNORECASE)
        for match in by_matches:
            artist = self._clean_artist_name(match)
            if artist and self._is_valid_artist_name(artist):
                artists['primary'].append(artist)
        
        # Pattern 2: "like [artist]" or "similar to [artist]"
        similar_pattern = (r'\b(?:like|similar to|sounds like|reminds me of)\s+'
                          r'([A-Za-z0-9\s&\-\'\.]+?)'
                          r'(?:\s*$|,|\?|!|\s+(?:but|music|songs?|tracks?|bands?|artists?|with|and|or|that|who|which))')
        similar_matches = re.findall(similar_pattern, text, re.IGNORECASE)
        for match in similar_matches:
            artist = self._clean_artist_name(match)
            if artist and self._is_valid_artist_name(artist):
                artists['similar_to'].append(artist)
        
        # Pattern 3: Quoted artist names
        quoted_pattern = r'["\']([A-Za-z0-9\s&\-\'\.]+?)["\']'
        quoted_matches = re.findall(quoted_pattern, text)
        for match in quoted_matches:
            artist = self._clean_artist_name(match)
            if artist and self._is_valid_artist_name(artist):
                # Check if it's in a similarity context
                match_pos = text_lower.find(match.lower())
                context_before = text_lower[max(0, match_pos-20):match_pos]
                if any(indicator in context_before for indicator in ['like', 'similar']):
                    artists['similar_to'].append(artist)
                else:
                    artists['primary'].append(artist)
        
        # Remove duplicates while preserving order
        artists['primary'] = list(dict.fromkeys(artists['primary']))
        artists['similar_to'] = list(dict.fromkeys(artists['similar_to']))
        
        self.logger.debug(
            "Artists extracted from text",
            primary_count=len(artists['primary']),
            similar_count=len(artists['similar_to']),
            text_length=len(text)
        )
        
        return artists
    
    def extract_genres_from_text(self, text: str) -> Dict[str, List[str]]:
        """
        Extract genres from text using pattern matching.
        
        Args:
            text: Input text to extract genres from
            
        Returns:
            Dictionary with 'primary' and 'secondary' genre lists
        """
        text_lower = text.lower()
        genres = {'primary': [], 'secondary': []}
        
        # Check for explicit genre mentions
        for main_genre, variations in self.genre_patterns.items():
            for variation in variations:
                if variation in text_lower:
                    # Determine if it's primary or secondary based on context
                    if self._is_primary_genre_mention(text_lower, variation):
                        if main_genre not in genres['primary']:
                            genres['primary'].append(main_genre)
                    else:
                        if main_genre not in genres['secondary'] and main_genre not in genres['primary']:
                            genres['secondary'].append(main_genre)
        
        # Pattern matching for genre-like words
        genre_pattern = r'\b([a-z]+(?:\s+[a-z]+)*)\s+(?:music|genre|style|sound)\b'
        genre_matches = re.findall(genre_pattern, text_lower)
        for match in genre_matches:
            cleaned_genre = match.strip()
            if cleaned_genre and len(cleaned_genre) > 2:
                if cleaned_genre not in genres['primary'] and cleaned_genre not in genres['secondary']:
                    genres['secondary'].append(cleaned_genre)
        
        self.logger.debug(
            "Genres extracted from text",
            primary_count=len(genres['primary']),
            secondary_count=len(genres['secondary'])
        )
        
        return genres
    
    def extract_moods_from_text(self, text: str) -> Dict[str, List[str]]:
        """
        Extract mood indicators from text.
        
        Args:
            text: Input text to extract moods from
            
        Returns:
            Dictionary with 'primary' mood list
        """
        text_lower = text.lower()
        moods = {'primary': []}
        
        # Check for explicit mood mentions
        for mood, indicators in self.mood_patterns.items():
            for indicator in indicators:
                if indicator in text_lower:
                    if mood not in moods['primary']:
                        moods['primary'].append(mood)
        
        # Pattern matching for mood adjectives
        mood_pattern = r'\b(feel|feeling|mood|vibe|atmosphere)\s+([a-z]+)\b'
        mood_matches = re.findall(mood_pattern, text_lower)
        for _, mood_word in mood_matches:
            if mood_word and len(mood_word) > 3:
                if mood_word not in moods['primary']:
                    moods['primary'].append(mood_word)
        
        self.logger.debug(
            "Moods extracted from text",
            mood_count=len(moods['primary'])
        )
        
        return moods
    
    def extract_context_from_text(self, text: str) -> List[str]:
        """
        Extract context/activity indicators from text.
        
        Args:
            text: Input text to extract context from
            
        Returns:
            List of context indicators
        """
        text_lower = text.lower()
        contexts = []
        
        # Check for explicit context mentions
        for context, indicators in self.context_patterns.items():
            for indicator in indicators:
                if indicator in text_lower:
                    if context not in contexts:
                        contexts.append(context)
        
        # Pattern matching for activity contexts
        activity_pattern = r'\b(?:for|while|during)\s+([a-z]+(?:\s+[a-z]+)*)\b'
        activity_matches = re.findall(activity_pattern, text_lower)
        for match in activity_matches:
            cleaned_activity = match.strip()
            if cleaned_activity and len(cleaned_activity) > 2:
                if cleaned_activity not in contexts:
                    contexts.append(cleaned_activity)
        
        self.logger.debug(
            "Contexts extracted from text",
            context_count=len(contexts)
        )
        
        return contexts
    
    def extract_tracks_from_text(self, text: str) -> Dict[str, List[str]]:
        """
        Extract track/song names from text.
        
        Args:
            text: Input text to extract tracks from
            
        Returns:
            Dictionary with 'primary' track list
        """
        tracks = {'primary': []}
        
        # Pattern 1: "song [title]" or "track [title]"
        song_pattern = r'\b(?:song|track)\s+["\']?([A-Za-z0-9\s&\-\'\.]+?)["\']?(?:\s|$|,|\.|\?|!)'
        song_matches = re.findall(song_pattern, text, re.IGNORECASE)
        for match in song_matches:
            track = self._clean_track_name(match)
            if track and self._is_valid_track_name(track):
                tracks['primary'].append(track)
        
        # Pattern 2: Quoted titles that might be songs
        quoted_pattern = r'["\']([A-Za-z0-9\s&\-\'\.]+?)["\']'
        quoted_matches = re.findall(quoted_pattern, text)
        for match in quoted_matches:
            track = self._clean_track_name(match)
            if track and self._is_valid_track_name(track):
                # Check if it's in a song context
                match_pos = text.lower().find(match.lower())
                context_before = text.lower()[max(0, match_pos-20):match_pos]
                context_after = text.lower()[match_pos:match_pos+len(match)+20]
                if any(indicator in context_before + context_after for indicator in ['song', 'track', 'play']):
                    tracks['primary'].append(track)
        
        # Remove duplicates
        tracks['primary'] = list(dict.fromkeys(tracks['primary']))
        
        self.logger.debug(
            "Tracks extracted from text",
            track_count=len(tracks['primary'])
        )
        
        return tracks
    
    def validate_and_enhance_entities(
        self, entities: Dict[str, Any], original_text: str
    ) -> Dict[str, Any]:
        """
        Validate and enhance extracted entities.
        
        Args:
            entities: Extracted entities dictionary
            original_text: Original text for context
            
        Returns:
            Enhanced entities dictionary
        """
        enhanced_entities = entities.copy()
        
        # Ensure musical_entities structure exists
        if 'musical_entities' not in enhanced_entities:
            enhanced_entities['musical_entities'] = {}
        
        musical_entities = enhanced_entities['musical_entities']
        
        # Helper to process entity categories (artists, genres, tracks, moods)
        def process_entity_category(entity_type_plural: str, cleaner_func, validator_func):
            if entity_type_plural not in musical_entities:
                # If not present at all, extract fresh from original_text
                if entity_type_plural == 'artists':
                    musical_entities[entity_type_plural] = self.extract_artists_from_text(original_text)
                elif entity_type_plural == 'genres':
                    musical_entities[entity_type_plural] = self.extract_genres_from_text(original_text)
                elif entity_type_plural == 'tracks':
                    musical_entities[entity_type_plural] = self.extract_tracks_from_text(original_text)
                elif entity_type_plural == 'moods':
                    musical_entities[entity_type_plural] = self.extract_moods_from_text(original_text)
            
            # Now process/validate the entity categories
            entity_categories_dict = musical_entities[entity_type_plural]
            if isinstance(entity_categories_dict, dict):
                for category_key in entity_categories_dict:
                    current_items = entity_categories_dict[category_key]
                    if isinstance(current_items, list):
                        processed_item_list = []
                        for item_entry in current_items:
                            if isinstance(item_entry, str):
                                # Simple string entry
                                cleaned_name_str = cleaner_func(item_entry)
                                if validator_func(cleaned_name_str):
                                    processed_item_list.append(cleaned_name_str)
                            elif isinstance(item_entry, dict):
                                # Dict entry with possible confidence scores
                                item_name_to_clean = item_entry.get('name')
                                original_confidence = item_entry.get('confidence', 0.5)
                                
                                if isinstance(item_name_to_clean, str):
                                    cleaned_name_str = cleaner_func(item_name_to_clean)
                                    if validator_func(cleaned_name_str):
                                        processed_item_list.append({
                                            'name': cleaned_name_str,
                                            'confidence': original_confidence 
                                        })
                                elif item_name_to_clean is None and isinstance(item_entry, dict):
                                    self.logger.warning(f"{entity_type_plural} entry object is a dict but missing 'name'", entity_entry=item_entry)
                                elif item_name_to_clean is not None:  # Should be string or None by now
                                    self.logger.warning(f"Unexpected type for {entity_type_plural} name_to_clean", type_found=type(item_name_to_clean))
                        entity_categories_dict[category_key] = processed_item_list
                    else:
                        self.logger.warning(f"Expected list for {entity_type_plural}.{category_key}, got {type(current_items)}")
        
        # Process each entity type
        process_entity_category('artists', self._clean_artist_name, self._is_valid_artist_name)
        process_entity_category('genres', lambda x: x.strip().lower(), lambda x: len(x) > 1)  # Simpler cleaning for genres
        process_entity_category('tracks', self._clean_track_name, self._is_valid_track_name)
        
        # Handle moods (they follow the same structure)
        if 'moods' not in musical_entities:
            musical_entities['moods'] = self.extract_moods_from_text(original_text)
        else:
            # Process moods with basic cleaning
            process_entity_category('moods', lambda x: x.strip().lower(), lambda x: len(x) > 1)
        
        # Add context information
        if 'context_factors' not in enhanced_entities:
            enhanced_entities['context_factors'] = self.extract_context_from_text(original_text)
        
        # Add confidence scores (this will convert remaining string items to dicts)
        enhanced_entities = self._add_confidence_scores(enhanced_entities, original_text)
        
        self.logger.debug(
            "Entities validated and enhanced",
            total_artists=len(musical_entities.get('artists', {}).get('primary', [])),
            total_genres=len(musical_entities.get('genres', {}).get('primary', [])),
            total_moods=len(musical_entities.get('moods', {}).get('primary', [])),
            total_tracks=len(musical_entities.get('tracks', {}).get('primary', []))
        )
        
        return enhanced_entities
    
    def _clean_artist_name(self, artist: str) -> str:
        """Clean and normalize artist name."""
        if not artist:
            return ""
        
        # Remove extra whitespace and common artifacts
        cleaned = re.sub(r'\s+', ' ', artist.strip())
        
        # Remove common prefixes/suffixes that aren't part of artist names
        cleaned = re.sub(r'^(the\s+)?', '', cleaned, flags=re.IGNORECASE)
        cleaned = re.sub(r'\s+(band|group|artist|singer)$', '', cleaned, flags=re.IGNORECASE)
        
        # Remove punctuation at the end
        cleaned = re.sub(r'[.,!?;]+$', '', cleaned)
        
        return cleaned.strip()
    
    def _clean_track_name(self, track: str) -> str:
        """Clean and normalize track name."""
        if not track:
            return ""
        
        # Remove extra whitespace
        cleaned = re.sub(r'\s+', ' ', track.strip())
        
        # Remove punctuation at the end
        cleaned = re.sub(r'[.,!?;]+$', '', cleaned)
        
        return cleaned.strip()
    
    def _is_valid_artist_name(self, artist: str) -> bool:
        """Check if artist name is valid."""
        if not artist or len(artist) < 2:
            return False
        
        # Check for common non-artist words
        invalid_words = {
            'music', 'song', 'track', 'album', 'genre', 'style', 'sound',
            'playlist', 'radio', 'station', 'channel', 'video', 'audio',
            'listen', 'play', 'hear', 'find', 'search', 'recommend',
            'good', 'great', 'best', 'new', 'old', 'popular', 'famous'
        }
        
        artist_lower = artist.lower()
        if artist_lower in invalid_words:
            return False
        
        # Check if it's mostly numbers or special characters
        if re.match(r'^[0-9\s\-_\.]+$', artist):
            return False
        
        return True
    
    def _is_valid_track_name(self, track: str) -> bool:
        """Check if track name is valid."""
        if not track or len(track) < 2:
            return False
        
        # Check for common non-track words
        invalid_words = {
            'music', 'artist', 'band', 'singer', 'musician', 'genre',
            'style', 'sound', 'playlist', 'radio', 'station', 'channel'
        }
        
        track_lower = track.lower()
        if track_lower in invalid_words:
            return False
        
        return True
    
    def _is_primary_genre_mention(self, text: str, genre: str) -> bool:
        """Determine if genre mention is primary based on context."""
        genre_pos = text.find(genre)
        if genre_pos == -1:
            return False
        
        # Check context around the genre mention
        context_before = text[max(0, genre_pos-30):genre_pos]
        context_after = text[genre_pos:genre_pos+len(genre)+30]
        
        # Primary indicators
        primary_indicators = ['want', 'need', 'looking for', 'find', 'recommend', 'love', 'like']
        
        # Secondary indicators
        secondary_indicators = ['also', 'maybe', 'sometimes', 'occasionally', 'similar']
        
        context = context_before + context_after
        
        if any(indicator in context for indicator in primary_indicators):
            return True
        elif any(indicator in context for indicator in secondary_indicators):
            return False
        
        # Default to primary if no clear indicators
        return True
    
    def _add_confidence_scores(
        self, entities: Dict[str, Any], original_text: str
    ) -> Dict[str, Any]:
        """Add confidence scores to extracted entities."""
        text_length = len(original_text)
        
        # Calculate confidence based on text length and entity specificity
        base_confidence = min(0.8, 0.3 + (text_length / 200))
        
        musical_entities = entities.get('musical_entities', {})
        
        # Helper function to process entity categories
        def add_confidence_to_category(category_name: str, confidence_modifier: float = 1.0):
            if category_name in musical_entities:
                for category_key in musical_entities[category_name]:
                    category_list = musical_entities[category_name][category_key]
                    if isinstance(category_list, list):
                        for i, item in enumerate(category_list):
                            if isinstance(item, dict):
                                # Item already has confidence, preserve it unless it's too low
                                if item.get('confidence', 0) < 0.3:
                                    item['confidence'] = base_confidence * confidence_modifier
                            elif isinstance(item, str):
                                # Convert string to dict with confidence
                                item_confidence = base_confidence * confidence_modifier
                                if category_name == 'artists':
                                    # Higher confidence for longer, more specific artist names
                                    item_confidence += (len(item) / 100)
                                
                                category_list[i] = {
                                    'name': item,
                                    'confidence': min(0.95, item_confidence)
                                }
                            else:
                                # Handle unexpected item types
                                self.logger.warning(
                                    f"Unexpected item type in {category_name}.{category_key}",
                                    item_type=type(item),
                                    item=str(item)
                                )
        
        # Add confidence to each entity type
        add_confidence_to_category('artists', 1.0)
        add_confidence_to_category('genres', 1.0)
        add_confidence_to_category('moods', 1.0)
        add_confidence_to_category('tracks', 1.0)
        
        return entities
    
    def extract_similarity_indicators(self, text: str) -> Dict[str, Any]:
        """
        Extract similarity indicators and comparison patterns.
        
        Args:
            text: Input text to analyze
            
        Returns:
            Dictionary with similarity information
        """
        text_lower = text.lower()
        similarity_info = {
            'has_similarity_request': False,
            'similarity_type': None,
            'comparison_artists': [],
            'similarity_strength': 'medium'
        }
        
        # Check for similarity patterns
        similarity_patterns = [
            r'\b(?:like|similar to|sounds like|reminds me of)\s+([A-Za-z0-9\s&\-\'\.]+)',
            r'\b(?:in the style of|influenced by)\s+([A-Za-z0-9\s&\-\'\.]+)',
            r'\b(?:comparable to|along the lines of)\s+([A-Za-z0-9\s&\-\'\.]+)'
        ]
        
        for pattern in similarity_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                similarity_info['has_similarity_request'] = True
                for match in matches:
                    artist = self._clean_artist_name(match)
                    if artist and self._is_valid_artist_name(artist):
                        similarity_info['comparison_artists'].append(artist)
        
        # Determine similarity type
        if 'exactly like' in text_lower or 'just like' in text_lower:
            similarity_info['similarity_type'] = 'exact'
            similarity_info['similarity_strength'] = 'high'
        elif 'somewhat like' in text_lower or 'kind of like' in text_lower:
            similarity_info['similarity_type'] = 'loose'
            similarity_info['similarity_strength'] = 'low'
        elif similarity_info['has_similarity_request']:
            similarity_info['similarity_type'] = 'moderate'
            similarity_info['similarity_strength'] = 'medium'
        
        return similarity_info