"""
Acronym and abbreviation checker.

Validates that:
- Acronyms found in text have corresponding full forms defined
- Acronyms are used after their definition
- Only checks acronyms that have matching full forms in the document
"""
import re
from typing import List, Dict, Set, Tuple
from collections import defaultdict

from .base import BaseChecker, CheckResult, CheckSeverity


class AcronymChecker(BaseChecker):
    """Check acronym definitions and consistency.

    Acronyms are runs of three or more capital letters (optionally followed
    by a lowercase ``s``). An acronym is only reported when a phrase whose
    word initials spell it out exists somewhere in the text; it is then
    flagged if it is never defined or if it is used before its definition.
    """

    # Checker metadata; presumably read by the BaseChecker framework -- verify
    # against the base class.
    name = "acronym"
    display_name = "Acronyms"
    description = "Check acronym definitions and consistent usage"

    # Enhanced pattern to find defined acronyms with LaTeX formatting support
    # Matches: "Full Name (ACRONYM)", "(ACRONYM; Full Name)", "Full Name (\textbf{ACRONYM})", etc.
    # Capture groups: first alternative -> (1) full name, (2) acronym;
    #                 second alternative -> (3) acronym, (4) full name.
    # The full-name class includes \s, so a match may span line breaks.
    # NOTE(review): re.MULTILINE only alters ^ and $, and the pattern uses
    # neither, so the flag currently has no effect.
    DEFINITION_PATTERN = re.compile(
        r'([A-Z][a-zA-Z\s\-]+)[\s~]*\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?\)|'  # Full Name (ABC) or Full Name (\textbf{ABC}) with optional ~
        r'\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?;\s*([A-Za-z\s\-]+)\)',  # (ABC; Full Name) or (\textbf{ABC}; Full Name)
        re.MULTILINE
    )

    # Pattern to find standalone acronyms (3+ capital letters), with an
    # optional trailing lowercase "s" so plurals such as "ABCs" are matched too
    ACRONYM_PATTERN = re.compile(r'\b([A-Z]{3,}s?)\b')

    # Comprehensive list of common acronyms that don't need definition
    # NOTE(review): a few entries are repeated across categories ('SSD', 'GPU',
    # 'MAP', 'EM', 'AAAI', 'IJCAI'); this is harmless in a set literal.
    # NOTE(review): entries with fewer than three capital letters, digits, or
    # mixed case (e.g. 'OS', 'T5', 'CoT') cannot be produced by ACRONYM_PATTERN,
    # so the lookups in this class never hit them.
    COMMON_ACRONYMS = {
        # Hardware & Computing
        'GPU', 'CPU', 'TPU', 'RAM', 'ROM', 'SSD', 'HDD', 'USB', 'BIOS', 'OS',
        'API', 'SDK', 'IDE', 'GUI', 'CLI', 'URL', 'URI', 'DNS', 'IP', 'TCP',
        'HTTP', 'HTTPS', 'FTP', 'SSH', 'SSL', 'TLS', 'VPN', 'LAN', 'WAN',

        # File Formats & Standards
        'PDF', 'HTML', 'CSS', 'XML', 'JSON', 'YAML', 'CSV', 'TSV', 'SQL',
        'UTF', 'ASCII', 'JPEG', 'PNG', 'GIF', 'SVG', 'MP3', 'MP4', 'ZIP',

        # AI & Machine Learning (General)
        'AI', 'ML', 'DL', 'NN', 'ANN', 'DNN', 'CNN', 'RNN', 'LSTM', 'GRU',
        'GAN', 'VAE', 'MLP', 'SVM', 'KNN', 'PCA', 'ICA', 'LDA', 'EM',
        'SGD', 'ADAM', 'RMSPROP', 'ADAGRAD', 'LBFGS',

        # NLP & Language Models
        'NLP', 'LLM', 'GPT', 'BERT', 'BART', 'T5', 'ELECTRA', 'ROBERTA',
        'NER', 'POS', 'QA', 'MT', 'ASR', 'TTS', 'NMT', 'SMT',
        'BLEU', 'ROUGE', 'METEOR', 'CIDEr', 'SPICE', 'WER', 'CER',

        # Computer Vision
        'CV', 'OCR', 'YOLO', 'RCNN', 'SSD', 'FCN', 'UNET', 'RESNET', 'VGG',
        'RGB', 'HSV', 'YUV', 'SIFT', 'SURF', 'ORB', 'HOG', 'SSIM', 'PSNR',

        # Reinforcement Learning
        'RL', 'DQN', 'DDPG', 'PPO', 'A3C', 'TRPO', 'SAC', 'TD3', 'MDP',
        'POMDP', 'RLHF', 'RLAIF',

        # Metrics & Evaluation
        'F1', 'AUC', 'ROC', 'PR', 'MAP', 'NDCG', 'MRR', 'MSE', 'MAE', 'RMSE',
        'MAPE', 'R2', 'IoU', 'AP', 'mAP', 'FPS', 'FLOPs', 'FLOPS',

        # Data & Statistics
        'IID', 'OOD', 'KL', 'JS', 'EMD', 'MMD', 'ELBO', 'VI', 'MCMC',
        'MLE', 'MAP', 'EM', 'GMM', 'HMM', 'CRF', 'MRF',

        # Academic & Organizations
        'IEEE', 'ACM', 'AAAI', 'IJCAI', 'ICML', 'ICLR', 'NEURIPS', 'NIPS',
        'ACL', 'EMNLP', 'NAACL', 'COLING', 'EACL', 'CVPR', 'ICCV', 'ECCV',
        'SIGIR', 'KDD', 'WWW', 'CIKM', 'WSDM', 'ICDE', 'VLDB', 'SIGMOD',
        'AAAI', 'IJCAI', 'AISTATS', 'UAI', 'COLT', 'ALT',

        # Methods & Techniques (Common in ML papers)
        'SOTA', 'E2E', 'RAG', 'CoT', 'ToT', 'GoT', 'ICL', 'FSL', 'ZSL',
        'PEFT', 'LORA', 'QLORA', 'SFT', 'DPO', 'SPIN', 'URPO', 'SPELL',
        'STaR', 'ReST', 'RRHF', 'RAFT', 'LIMA', 'ORPO',

        # Misc
        'USD', 'EUR', 'GBP', 'EU', 'US', 'UK', 'UN', 'NATO', 'NASA',
        'ID', 'UID', 'UUID', 'MD5', 'SHA', 'AES', 'RSA', 'JWT',
        'CRUD', 'REST', 'SOAP', 'RPC', 'AJAX', 'DOM', 'OOP', 'MVC',
        'CI', 'CD', 'DevOps', 'AWS', 'GCP', 'GPU', 'NPU', 'ASIC', 'FPGA',
    }
    
    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Report acronyms that are undefined or used ahead of their definition.

        Acronyms listed in ``COMMON_ACRONYMS`` and acronyms for which no
        candidate full form exists in the text are ignored entirely.

        Args:
            tex_content: Raw LaTeX source; comments are stripped first via
                ``_remove_comments`` (not defined in this class, presumably
                inherited from ``BaseChecker``).
            config: Accepted for interface compatibility; not read here.

        Returns:
            One WARNING result per offending acronym, in order of each
            acronym's first recorded usage.
        """
        text = self._remove_comments(tex_content)

        definition_offsets = self._find_definitions(text)
        usage_map = self._find_all_usages(text)
        candidate_forms = self._find_potential_full_forms(text, usage_map.keys())

        findings = []
        for name, offsets in usage_map.items():
            # Only acronyms that are uncommon AND have a plausible expansion
            # somewhere in the document are worth reporting.
            if name in self.COMMON_ACRONYMS or name not in candidate_forms:
                continue

            if name in definition_offsets:
                definition_offset = definition_offsets[name]
                definition_line = self._find_line_number(text, definition_offset)

                for offset in offsets:
                    if offset >= definition_offset:
                        continue
                    usage_line = self._find_line_number(text, offset)
                    # A usage sharing the definition's line is treated as part
                    # of (or adjacent to) the definition and is not reported.
                    if usage_line == definition_line:
                        continue
                    findings.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=f"Acronym '{name}' used before definition",
                        line_number=usage_line,
                        suggestion="Move definition before first use"
                    ))
                    # One report per acronym is enough.
                    break
                continue

            # No formal definition matched: inspect the first usage only.
            first_line = self._find_line_number(text, offsets[0])
            expansion = candidate_forms[name]

            # Loose definition check: if the expansion appears on the same
            # line as the first usage (any case), accept it as defined there.
            first_line_text = self._get_line_content(text, first_line)
            if expansion.lower() in first_line_text.lower():
                continue

            findings.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message=f"Acronym '{name}' used without definition (possible full form: '{expansion}')",
                line_number=first_line,
                suggestion=f"Define on first use: '{expansion} ({name})'"
            ))

        return findings
    
    def _find_potential_full_forms(self, content: str, acronyms: Set[str]) -> Dict[str, str]:
        """Find potential full forms for acronyms by matching capital letters."""
        full_forms = {}
        
        for acronym in acronyms:
            if acronym in self.COMMON_ACRONYMS:
                continue
            
            # Build regex pattern to match full form
            # For "ABC", match words starting with A, B, C
            acronym_clean = acronym.rstrip('s')  # Remove plural
            if len(acronym_clean) < 3:
                continue
            
            # Create pattern: match sequence of words where first letters spell the acronym
            # Allow optional words in between (like "of", "the", "and")
            pattern_parts = []
            for i, letter in enumerate(acronym_clean):
                if i == 0:
                    # First word must start with the letter
                    pattern_parts.append(f'{letter}[a-z]+')
                else:
                    # Subsequent words: allow optional filler words
                    pattern_parts.append(f'(?:\\s+(?:of|the|and|for|in|on|with|to)\\s+)?\\s+{letter}[a-z]+')
            
            full_pattern = r'\b' + ''.join(pattern_parts) + r'\b'
            
            try:
                matches = re.finditer(full_pattern, content, re.IGNORECASE)
                for match in matches:
                    candidate = match.group(0)
                    
                    # Skip if candidate contains common non-content words
                    # These words indicate the match is part of a sentence, not an acronym full form
                    excluded_words = {
                        'that', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
                        'or', 'not', 'no', 'yes', 'if', 'but', 'as', 'at', 'by', 'from',
                        'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
                        'can', 'could', 'may', 'might', 'must', 'shall',
                        'this', 'these', 'those', 'such', 'which', 'what', 'who', 'when', 'where',
                        'how', 'why', 'all', 'each', 'every', 'some', 'any', 'many', 'much',
                        'more', 'most', 'less', 'few', 'several', 'other', 'another'
                    }
                    
                    candidate_words = re.findall(r'\b[A-Za-z]+\b', candidate.lower())
                    if any(word in excluded_words for word in candidate_words):
                        continue
                    
                    # Verify: extract first letters and check if they match acronym
                    words = re.findall(r'\b[A-Z][a-z]+', candidate, re.IGNORECASE)
                    # Filter out filler words (allowed in between but not counted)
                    filler_words = {'of', 'and', 'for', 'in', 'on', 'with', 'to', 'a', 'an'}
                    meaningful_words = [w for w in words if w.lower() not in filler_words]
                    
                    if len(meaningful_words) >= len(acronym_clean):
                        first_letters = ''.join(w[0].upper() for w in meaningful_words[:len(acronym_clean)])
                        if first_letters == acronym_clean:
                            full_forms[acronym] = candidate
                            break  # Found a match, use the first one
            except re.error:
                # Invalid regex, skip this acronym
                continue
        
        return full_forms
    
    def _find_definitions(self, content: str) -> Dict[str, int]:
        """Find all acronym definitions and their positions."""
        definitions = {}
        
        for match in self.DEFINITION_PATTERN.finditer(content):
            # Get acronym from either pattern
            acronym = match.group(2) or match.group(3)
            if acronym:
                acronym = acronym.rstrip('s')  # Remove plural
                definitions[acronym] = match.start()
        
        return definitions
    
    def _find_all_usages(self, content: str) -> Dict[str, List[int]]:
        """Find all acronym usages, excluding special contexts."""
        usages = defaultdict(list)
        
        for match in self.ACRONYM_PATTERN.finditer(content):
            acronym = match.group(1).rstrip('s')
            pos = match.start()
            
            # Skip if in special context
            if self._is_in_special_context(content, pos, acronym):
                continue
            
            usages[acronym].append(pos)
        
        return usages
    
    def _is_in_special_context(self, content: str, pos: int, acronym: str) -> bool:
        """Check if acronym at position is in a special context that should be ignored."""
        # Get surrounding context
        start = max(0, pos - 50)
        end = min(len(content), pos + len(acronym) + 50)
        before = content[start:pos]
        after = content[pos + len(acronym):end]
        
        # Skip if inside definition parentheses: (ACRONYM)
        if before.endswith('(') and after.startswith(')'):
            return True
        
        # Skip if inside LaTeX command: \ACRONYM or \command{ACRONYM}
        if before.rstrip().endswith('\\'):
            return True
        
        # Skip if inside label: \label{...:ACRONYM...}
        if r'\label{' in before[-20:] and '}' in after[:20]:
            return True
        
        # Skip if inside ref: \ref{...:ACRONYM...}
        if re.search(r'\\(?:ref|cite|autoref|cref|eqref)\{[^}]*$', before[-30:]):
            return True
        
        # Skip if inside URL: \url{...ACRONYM...} or http://...ACRONYM...
        if r'\url{' in before[-20:] or 'http' in before[-20:]:
            return True
        
        # Skip if inside math mode (simple heuristic)
        # Count $ signs before position
        dollar_count = before.count('$') - before.count(r'\$')
        if dollar_count % 2 == 1:  # Odd number means we're inside math mode
            return True
        
        # Skip if inside \begin{equation} or similar
        if re.search(r'\\begin\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
            if not re.search(r'\\end\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
                return True
        
        # Skip if it looks like a LaTeX command argument: \command[ACRONYM]
        if before.endswith('[') and after.startswith(']'):
            return True
        
        # Skip if part of a file path or extension
        if '.' in before[-5:] or '/' in before[-10:]:
            return True
        
        return False