|
|
""" |
|
|
Text normalization utilities for comparing bibliography entries. |
|
|
""" |
|
|
import re |
|
|
import unicodedata |
|
|
from unidecode import unidecode |
|
|
|
|
|
|
|
|
class TextNormalizer:
    """Utility class for normalizing text for comparison.

    All methods are classmethods; the class only groups the LaTeX lookup
    tables with the normalization and similarity helpers that use them.
    """

    # (regex, replacement) pairs that unwrap a LaTeX command and keep only
    # its (brace-free) argument, e.g. ``\textbf{x}`` -> ``x``.
    LATEX_COMMANDS = [
        (r'\\textbf\{([^}]*)\}', r'\1'),
        (r'\\textit\{([^}]*)\}', r'\1'),
        (r'\\emph\{([^}]*)\}', r'\1'),
        (r'\\textrm\{([^}]*)\}', r'\1'),
        (r'\\texttt\{([^}]*)\}', r'\1'),
        (r'\\textsf\{([^}]*)\}', r'\1'),
        (r'\\textsc\{([^}]*)\}', r'\1'),
        (r'\\text\{([^}]*)\}', r'\1'),
        (r'\\mathrm\{([^}]*)\}', r'\1'),
        (r'\\mathbf\{([^}]*)\}', r'\1'),
        (r'\\mathit\{([^}]*)\}', r'\1'),
        (r'\\url\{([^}]*)\}', r'\1'),
        # \href{target}{label}: keep the label, drop the target.
        (r'\\href\{[^}]*\}\{([^}]*)\}', r'\1'),
    ]

    # Literal LaTeX sequences and their plain-text replacements. They are
    # applied longest-first by ``normalize_latex`` so that ``---`` is not
    # consumed as ``--`` plus a stray hyphen.
    LATEX_CHARS = {
        r'\&': '&',
        r'\%': '%',
        r'\$': '$',
        r'\#': '#',
        r'\_': '_',
        r'\{': '{',
        r'\}': '}',
        r'\~': '~',
        r'\^': '^',
        r'``': '"',
        r"''": '"',
        r'`': "'",
        r"'": "'",
        r'---': '—',
        r'--': '–',
    }

    # (regex, replacement) pairs that drop a LaTeX accent command and keep
    # the base letter. ``^`` sits inside a character class here; outside one
    # it would be a start-of-string anchor and the pattern could never match.
    LATEX_ACCENTS = [
        # Acute, grave, circumflex, umlaut with a braced letter: \'{e}
        (r"\\['`^\"]\{([A-Za-z])\}", r'\1'),
        # ... and with a bare letter: \'e
        (r"\\['`^\"]([A-Za-z])", r'\1'),
        # Tilde accent; restricted to n/a/o because ``\~`` followed by other
        # letters also appears as a literal tilde (e.g. in URLs).
        (r'\\~\{([nNaAoO])\}', r'\1'),
        (r'\\~([nNaAoO])', r'\1'),
        # Letter-named accents with a braced letter: \c{c}, \v{s}, \H{o}, ...
        (r'\\[cvuHkr]\{([A-Za-z])\}', r'\1'),
    ]

    @classmethod
    def normalize_latex(cls, text: str) -> str:
        """Remove LaTeX formatting commands.

        Unwraps the commands in ``LATEX_COMMANDS``, strips the accents in
        ``LATEX_ACCENTS``, substitutes the sequences in ``LATEX_CHARS`` and
        finally deletes every remaining ``{`` and ``}``.

        Args:
            text: Raw text that may contain LaTeX markup.

        Returns:
            The text with the supported markup removed, or ``""`` when
            ``text`` is empty or ``None``.
        """
        if not text:
            return ""

        result = text
        for pattern, replacement in (*cls.LATEX_COMMANDS, *cls.LATEX_ACCENTS):
            result = re.sub(pattern, replacement, result)

        # Longest key first so that a sequence is never shadowed by one of
        # its own prefixes; the stable sort keeps the declared order for
        # keys of equal length.
        by_length = sorted(
            cls.LATEX_CHARS.items(),
            key=lambda item: len(item[0]),
            reverse=True,
        )
        for latex_char, normal_char in by_length:
            result = result.replace(latex_char, normal_char)

        # Grouping braces carry no textual content.
        return re.sub(r'[{}]', '', result)

    @classmethod
    def normalize_unicode(cls, text: str) -> str:
        """Normalize Unicode characters to ASCII.

        Applies NFKD decomposition and then passes the result through
        ``unidecode``.

        Args:
            text: Text to convert.

        Returns:
            The converted text, or ``""`` when ``text`` is empty or ``None``.
        """
        if not text:
            return ""

        decomposed = unicodedata.normalize('NFKD', text)
        return unidecode(decomposed)

    @classmethod
    def normalize_whitespace(cls, text: str) -> str:
        """Collapse every whitespace run to one space and trim both ends.

        Args:
            text: Text to clean up.

        Returns:
            The cleaned text, or ``""`` when ``text`` is empty or ``None``.
        """
        if not text:
            return ""

        return re.sub(r'\s+', ' ', text).strip()

    @classmethod
    def remove_punctuation(cls, text: str) -> str:
        """Remove punctuation for comparison.

        Deletes every character that is neither a regex word character
        (``\\w``, which includes the underscore) nor whitespace.

        Args:
            text: Text to strip.

        Returns:
            The stripped text, or ``""`` when ``text`` is empty or ``None``.
        """
        if not text:
            return ""

        return re.sub(r'[^\w\s]', '', text)

    @classmethod
    def normalize_for_comparison(cls, text: str) -> str:
        """
        Full normalization pipeline for text comparison.

        Steps:
        1. Remove LaTeX formatting
        2. Normalize Unicode to ASCII
        3. Convert to lowercase
        4. Remove punctuation
        5. Normalize whitespace

        Whitespace is normalized last because deleting punctuation that
        stood between spaces (``"a - b"``) would otherwise leave a double
        space or a leading/trailing space in the result.

        Args:
            text: Raw text, e.g. a bibliography title.

        Returns:
            The normalized text, or ``""`` when ``text`` is empty or ``None``.
        """
        if not text:
            return ""

        text = cls.normalize_latex(text)
        text = cls.normalize_unicode(text)
        text = text.lower()
        text = cls.remove_punctuation(text)
        return cls.normalize_whitespace(text)

    @classmethod
    def normalize_author_name(cls, name: str) -> str:
        """
        Normalize author name format.

        Handles: "Last, First" and "First Last" formats.
        Returns: normalized "first last" format.

        The name is split at the first comma only; whatever follows the
        comma is moved in front of whatever precedes it. The result is
        lowercased, stripped of punctuation and whitespace-normalized.

        Args:
            name: A single author name.

        Returns:
            The normalized name, or ``""`` when ``name`` is empty or
            ``None`` or contains nothing but punctuation and whitespace.
        """
        if not name:
            return ""

        name = cls.normalize_latex(name)
        name = cls.normalize_unicode(name)
        name = cls.normalize_whitespace(name)

        last, comma, first = name.partition(',')
        if comma:
            name = f"{first.strip()} {last.strip()}"

        name = cls.remove_punctuation(name.lower())
        # Collapse again: removing punctuation (or a missing name part)
        # can leave doubled or dangling spaces.
        return cls.normalize_whitespace(name)

    @classmethod
    def normalize_author_list(cls, authors: str) -> list[str]:
        """
        Parse and normalize a list of authors.

        Handles "and" as separator and "Last, First" format. The separator
        is matched case-insensitively and must be surrounded by whitespace.

        Args:
            authors: Author field, e.g. ``"Doe, Jane and John Smith"``.

        Returns:
            The normalized names in input order; names that normalize to an
            empty string are dropped. ``[]`` when ``authors`` is empty or
            ``None``.
        """
        if not authors:
            return []

        raw_names = re.split(r'\s+and\s+', authors, flags=re.IGNORECASE)
        candidates = (cls.normalize_author_name(raw.strip()) for raw in raw_names)
        return [candidate for candidate in candidates if candidate]

    @classmethod
    def similarity_ratio(cls, text1: str, text2: str) -> float:
        """
        Calculate similarity ratio between two strings.

        Uses simple word-based Jaccard similarity: the number of distinct
        whitespace-separated words shared by both strings divided by the
        number of distinct words in either.

        Args:
            text1: First string.
            text2: Second string.

        Returns:
            A value in ``[0.0, 1.0]``. ``0.0`` when either argument is empty
            or ``None`` (including when both are). ``1.0`` when both are
            non-empty but contain only whitespace.
        """
        if not text1 or not text2:
            return 0.0

        words1 = set(text1.split())
        words2 = set(text2.split())

        # Non-empty strings can still yield no words (whitespace only).
        if not words1 and not words2:
            return 1.0
        if not words1 or not words2:
            return 0.0

        return len(words1 & words2) / len(words1 | words2)

    @classmethod
    def levenshtein_similarity(cls, s1: str, s2: str) -> float:
        """Calculate normalized Levenshtein similarity.

        Computes the edit distance (insert, delete, substitute, each with
        cost 1) and scales it by the length of the longer string.

        Args:
            s1: First string.
            s2: Second string.

        Returns:
            ``1.0 - distance / max(len(s1), len(s2))``. ``1.0`` when both
            strings are empty, ``0.0`` when exactly one is.
        """
        if not s1 and not s2:
            return 1.0
        if not s1 or not s2:
            return 0.0

        # The distance is symmetric, so keep the shorter string on the
        # inner axis; only two rows of the DP table are ever needed.
        if len(s1) < len(s2):
            s1, s2 = s2, s1

        previous = list(range(len(s2) + 1))
        for i, char1 in enumerate(s1, start=1):
            current = [i]
            for j, char2 in enumerate(s2, start=1):
                if char1 == char2:
                    current.append(previous[j - 1])
                else:
                    current.append(
                        1 + min(previous[j], current[j - 1], previous[j - 1])
                    )
            previous = current

        distance = previous[-1]
        return 1.0 - (distance / len(s1))
|
|
|