"""
Text normalization utilities for comparing bibliography entries.
"""
import re
import unicodedata

try:
    from unidecode import unidecode
except ImportError:
    # unidecode is preferred for transliteration; without it we fall back to
    # dropping characters that have no ASCII decomposition (see
    # TextNormalizer.normalize_unicode).
    unidecode = None


class TextNormalizer:
    """Utility class for normalizing text for comparison.

    All methods are classmethods; the class holds no state beyond the
    lookup tables below.
    """

    # LaTeX formatting commands: (regex, replacement) pairs that keep only the
    # braced argument. Applied in order by normalize_latex.
    LATEX_COMMANDS = [
        (r'\\textbf\{([^}]*)\}', r'\1'),
        (r'\\textit\{([^}]*)\}', r'\1'),
        (r'\\emph\{([^}]*)\}', r'\1'),
        (r'\\textrm\{([^}]*)\}', r'\1'),
        (r'\\texttt\{([^}]*)\}', r'\1'),
        (r'\\textsf\{([^}]*)\}', r'\1'),
        (r'\\textsc\{([^}]*)\}', r'\1'),
        (r'\\text\{([^}]*)\}', r'\1'),
        (r'\\mathrm\{([^}]*)\}', r'\1'),
        (r'\\mathbf\{([^}]*)\}', r'\1'),
        (r'\\mathit\{([^}]*)\}', r'\1'),
        (r'\\url\{([^}]*)\}', r'\1'),
        (r'\\href\{[^}]*\}\{([^}]*)\}', r'\1'),
    ]

    # LaTeX special character mappings, applied as plain substring
    # replacements in insertion order. Longer sequences must come before
    # their prefixes ('---' before '--', "``" before "`"), otherwise the
    # shorter entry consumes the text first.
    LATEX_CHARS = {
        r'\&': '&',
        r'\%': '%',
        r'\$': '$',
        r'\#': '#',
        r'\_': '_',
        r'\{': '{',
        r'\}': '}',
        r'\~': '~',
        r'\^': '^',
        r'``': '"',
        r"''": '"',
        r'`': "'",
        r"'": "'",
        r'---': '—',
        r'--': '–',
    }

    # LaTeX accent commands: (regex, replacement) pairs that reduce an
    # accented letter to its base ASCII letter.
    LATEX_ACCENTS = [
        # Symbol accents with a braced argument: \'{e} \`{a} \^{o} \"{u}
        # \~{n} \={a} \.{z}. Inside the character class '^' is literal.
        (r"""\\['`^"~=.]\{([A-Za-z])\}""", r'\1'),
        # Letter-named accents with a braced argument: \c{c} (cedilla),
        # \v{s} (caron), \u{g} (breve), \H{o} (double acute), \k{a} (ogonek),
        # \r{a} (ring).
        (r'\\[cvuHkr]\{([A-Za-z])\}', r'\1'),
        # Symbol accents without braces: \'e \`a \^o \"u \~n \=a \.z
        (r"""\\['`^"~=.]([A-Za-z])""", r'\1'),
    ]

    @classmethod
    def normalize_latex(cls, text: str) -> str:
        """Strip LaTeX markup from text, keeping the visible content.

        Formatting commands are unwrapped, accent commands are reduced to
        their base letter, escaped special characters and quote/dash
        ligatures are replaced, and any remaining braces are removed.

        Returns an empty string for empty or None input.
        """
        if not text:
            return ""

        result = text

        # Unwrap formatting commands, e.g. \textbf{x} -> x
        for pattern, replacement in cls.LATEX_COMMANDS:
            result = re.sub(pattern, replacement, result)

        # Accents must be handled before LATEX_CHARS, which would otherwise
        # turn the leading \^ or \~ of an accent into a bare ^ or ~.
        for pattern, replacement in cls.LATEX_ACCENTS:
            result = re.sub(pattern, replacement, result)

        # Replace escaped special characters and quote/dash ligatures
        for latex_char, normal_char in cls.LATEX_CHARS.items():
            result = result.replace(latex_char, normal_char)

        # Remove remaining braces (grouping braces and unescaped \{ \})
        return re.sub(r'[{}]', '', result)

    @classmethod
    def normalize_unicode(cls, text: str) -> str:
        """Convert text to ASCII.

        The text is NFKD-decomposed and then transliterated with unidecode.
        If unidecode is not installed, characters that are still non-ASCII
        after decomposition are dropped instead of transliterated.

        Returns an empty string for empty or None input.
        """
        if not text:
            return ""

        decomposed = unicodedata.normalize('NFKD', text)
        if unidecode is not None:
            return unidecode(decomposed)
        return decomposed.encode('ascii', 'ignore').decode('ascii')

    @classmethod
    def normalize_whitespace(cls, text: str) -> str:
        """Collapse every whitespace run to one space and trim both ends."""
        if not text:
            return ""
        return re.sub(r'\s+', ' ', text).strip()

    @classmethod
    def remove_punctuation(cls, text: str) -> str:
        """Remove every character that is neither a word character nor whitespace.

        Word characters follow the regex class \\w, so letters, digits and
        the underscore are kept.
        """
        if not text:
            return ""
        return re.sub(r'[^\w\s]', '', text)

    @classmethod
    def normalize_for_comparison(cls, text: str) -> str:
        """
        Full normalization pipeline for text comparison.

        Steps:
        1. Remove LaTeX formatting
        2. Normalize Unicode to ASCII
        3. Convert to lowercase
        4. Remove punctuation
        5. Normalize whitespace

        Whitespace is normalized last because removing a free-standing
        punctuation mark (as in "a - b") leaves a doubled or dangling space.
        """
        if not text:
            return ""

        text = cls.normalize_latex(text)
        text = cls.normalize_unicode(text)
        text = text.lower()
        text = cls.remove_punctuation(text)
        return cls.normalize_whitespace(text)

    @classmethod
    def normalize_author_name(cls, name: str) -> str:
        """
        Normalize author name format.

        Handles: "Last, First" and "First Last" formats.
        Returns: normalized lowercase "first last" format without
        punctuation and with single spaces.
        """
        if not name:
            return ""

        name = cls.normalize_latex(name)
        name = cls.normalize_unicode(name)
        name = cls.normalize_whitespace(name)

        # "Last, First" -> "First Last"; only the first comma splits.
        if ',' in name:
            last, _, first = name.partition(',')
            name = f"{first.strip()} {last.strip()}"

        name = cls.remove_punctuation(name.lower())
        # A missing first/last part or a free-standing punctuation mark can
        # leave stray spaces, so collapse whitespace once more at the end.
        return cls.normalize_whitespace(name)

    @classmethod
    def normalize_author_list(cls, authors: str) -> list[str]:
        """
        Parse and normalize a list of authors.

        Splits on the word "and" (case-insensitive, surrounded by
        whitespace), normalizes each name with normalize_author_name, and
        drops names that normalize to an empty string.
        """
        if not authors:
            return []

        normalized = []
        for author in re.split(r'\s+and\s+', authors, flags=re.IGNORECASE):
            normalized_name = cls.normalize_author_name(author.strip())
            if normalized_name:
                normalized.append(normalized_name)
        return normalized

    @classmethod
    def similarity_ratio(cls, text1: str, text2: str) -> float:
        """
        Calculate similarity ratio between two strings.

        Uses word-based Jaccard similarity: shared distinct words divided by
        all distinct words. Returns 0.0 if either string is empty, and 1.0
        if both are non-empty but contain only whitespace.
        """
        if not text1 or not text2:
            return 0.0

        words1 = set(text1.split())
        words2 = set(text2.split())

        if not words1 and not words2:
            return 1.0
        if not words1 or not words2:
            return 0.0

        return len(words1 & words2) / len(words1 | words2)

    @classmethod
    def levenshtein_similarity(cls, s1: str, s2: str) -> float:
        """Calculate normalized Levenshtein similarity.

        Returns 1 - distance / max(len(s1), len(s2)), where distance is the
        edit distance with unit-cost insert, delete and substitute. Two empty
        strings give 1.0; exactly one empty string gives 0.0.
        """
        if not s1 and not s2:
            return 1.0
        if not s1 or not s2:
            return 0.0

        # Only the previous row of the DP table is ever read, so keep two
        # rows instead of the full (len(s1)+1) x (len(s2)+1) matrix.
        previous = list(range(len(s2) + 1))
        for i, char1 in enumerate(s1, start=1):
            current = [i]
            for j, char2 in enumerate(s2, start=1):
                if char1 == char2:
                    current.append(previous[j - 1])
                else:
                    current.append(
                        1 + min(previous[j], current[j - 1], previous[j - 1])
                    )
            previous = current

        distance = previous[-1]
        return 1.0 - (distance / max(len(s1), len(s2)))