File size: 7,015 Bytes
46df5f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
"""
Text normalization utilities for comparing bibliography entries.
"""
import re
import unicodedata
from unidecode import unidecode


class TextNormalizer:
    """Stateless helpers for normalizing and comparing bibliography text.

    All methods are classmethods; the class is used as a namespace and is
    never instantiated by the code in this module.
    """

    # (regex, replacement) pairs that unwrap simple one-argument LaTeX
    # formatting commands, keeping only the argument text.
    LATEX_COMMANDS = [
        (r'\\textbf\{([^}]*)\}', r'\1'),
        (r'\\textit\{([^}]*)\}', r'\1'),
        (r'\\emph\{([^}]*)\}', r'\1'),
        (r'\\textrm\{([^}]*)\}', r'\1'),
        (r'\\texttt\{([^}]*)\}', r'\1'),
        (r'\\textsf\{([^}]*)\}', r'\1'),
        (r'\\textsc\{([^}]*)\}', r'\1'),
        (r'\\text\{([^}]*)\}', r'\1'),
        (r'\\mathrm\{([^}]*)\}', r'\1'),
        (r'\\mathbf\{([^}]*)\}', r'\1'),
        (r'\\mathit\{([^}]*)\}', r'\1'),
        (r'\\url\{([^}]*)\}', r'\1'),
        (r'\\href\{[^}]*\}\{([^}]*)\}', r'\1'),
    ]

    # Literal LaTeX sequences and their plain-text replacements.
    # Replacements are applied in insertion order, so a longer sequence must
    # be listed before any shorter sequence it contains ('---' before '--',
    # '``' before '`').
    LATEX_CHARS = {
        r'\&': '&',
        r'\%': '%',
        r'\$': '$',
        r'\#': '#',
        r'\_': '_',
        r'\{': '{',
        r'\}': '}',
        r'\~': '~',
        r'\^': '^',
        r'``': '"',
        r"''": '"',
        r'`': "'",
        r"'": "'",
        r'---': '—',
        r'--': '–',
    }

    # (regex, replacement) pairs that drop a LaTeX accent command and keep
    # the bare base letter. Both the unbraced (\'e) and braced (\'{e}) forms
    # are covered. The circumflex must be written as \^ inside the regex:
    # an unescaped ^ is a start-of-string anchor and would never match here.
    LATEX_ACCENTS = [
        (r"\\'([aeiouAEIOU])", r'\1'),       # acute
        (r'\\`([aeiouAEIOU])', r'\1'),       # grave
        (r'\\\^([aeiouAEIOU])', r'\1'),      # circumflex
        (r'\\"([aeiouAEIOU])', r'\1'),       # umlaut
        (r'\\~([nNaAoO])', r'\1'),           # tilde
        (r'\\c\{([cC])\}', r'\1'),           # cedilla
        (r"\\'\{([aeiouAEIOU])\}", r'\1'),
        (r'\\`\{([aeiouAEIOU])\}', r'\1'),
        (r'\\\^\{([aeiouAEIOU])\}', r'\1'),
        (r'\\"\{([aeiouAEIOU])\}', r'\1'),
        (r'\\~\{([nNaAoO])\}', r'\1'),
    ]

    @classmethod
    def normalize_latex(cls, text: str) -> str:
        """Strip LaTeX markup from *text* and return the plain string.

        Processing order: formatting commands are unwrapped, accent commands
        are reduced to their base letter, escaped/special sequences are
        mapped to plain characters, and finally every remaining brace is
        removed. Returns "" for empty or None input.
        """
        if not text:
            return ""

        result = text

        # Unwrap formatting commands such as \textbf{...} and \emph{...}.
        for pattern, replacement in cls.LATEX_COMMANDS:
            result = re.sub(pattern, replacement, result)

        # Accents go before LATEX_CHARS: otherwise "\'" / "\^" / "\~" would be
        # consumed by the literal replacements and the accent lost as noise.
        for pattern, replacement in cls.LATEX_ACCENTS:
            result = re.sub(pattern, replacement, result)

        # Literal replacements, in the order LATEX_CHARS declares them.
        for latex_char, normal_char in cls.LATEX_CHARS.items():
            result = result.replace(latex_char, normal_char)

        # Grouping braces carry no textual meaning once commands are gone.
        return re.sub(r'[{}]', '', result)

    @classmethod
    def normalize_unicode(cls, text: str) -> str:
        """Transliterate *text* to ASCII.

        Applies NFKD normalization and then passes the result through
        ``unidecode``. Returns "" for empty or None input.
        """
        if not text:
            return ""

        decomposed = unicodedata.normalize('NFKD', text)
        return unidecode(decomposed)

    @classmethod
    def normalize_whitespace(cls, text: str) -> str:
        """Collapse each whitespace run to one space and trim both ends.

        Returns "" for empty or None input.
        """
        if not text:
            return ""

        return re.sub(r'\s+', ' ', text).strip()

    @classmethod
    def remove_punctuation(cls, text: str) -> str:
        """Delete every character that is neither a word character nor whitespace.

        "Word character" is the regex ``\\w`` class, so letters, digits and
        the underscore are kept. Returns "" for empty or None input.
        """
        if not text:
            return ""

        return re.sub(r'[^\w\s]', '', text)

    @classmethod
    def normalize_for_comparison(cls, text: str) -> str:
        """
        Full normalization pipeline for text comparison.

        Steps:
        1. Remove LaTeX formatting
        2. Normalize Unicode to ASCII
        3. Convert to lowercase
        4. Remove punctuation
        5. Normalize whitespace

        Whitespace is normalized last so that punctuation removed from
        between two spaces (e.g. "a - b") does not leave a double space or
        a dangling leading/trailing space in the result.
        """
        if not text:
            return ""

        text = cls.normalize_latex(text)
        text = cls.normalize_unicode(text)
        text = text.lower()
        text = cls.remove_punctuation(text)
        return cls.normalize_whitespace(text)

    @classmethod
    def normalize_author_name(cls, name: str) -> str:
        """
        Normalize author name format.
        Handles: "Last, First" and "First Last" formats.
        Returns: normalized "first last" format (lowercase, no punctuation,
        single spaces, no leading/trailing space).
        """
        if not name:
            return ""

        name = cls.normalize_latex(name)
        name = cls.normalize_unicode(name)
        name = cls.normalize_whitespace(name)

        # "Last, First" -> "First Last"; only the first comma is significant.
        if ',' in name:
            last, first = name.split(',', 1)
            name = f"{first.strip()} {last.strip()}"

        name = name.lower()
        name = cls.remove_punctuation(name)
        # Final cleanup: an empty first-name part ("Smith,") or punctuation
        # that stood between spaces would otherwise leave stray spaces.
        return cls.normalize_whitespace(name)

    @classmethod
    def normalize_author_list(cls, authors: str) -> list[str]:
        """
        Parse and normalize a list of authors.
        Handles "and" as separator and "Last, First" format.

        The separator is the word "and" surrounded by whitespace, matched
        case-insensitively. Entries that normalize to an empty string are
        dropped. Returns [] for empty or None input.
        """
        if not authors:
            return []

        raw_names = re.split(r'\s+and\s+', authors, flags=re.IGNORECASE)
        candidates = (
            cls.normalize_author_name(raw.strip()) for raw in raw_names
        )
        return [candidate for candidate in candidates if candidate]

    @classmethod
    def similarity_ratio(cls, text1: str, text2: str) -> float:
        """
        Calculate similarity ratio between two strings.
        Uses simple word-based Jaccard similarity.

        Returns 0.0 if either argument is empty/None, 1.0 if both are
        non-empty but contain no words (whitespace only), otherwise
        |shared words| / |all distinct words|.
        """
        if not text1 or not text2:
            return 0.0

        words1 = set(text1.split())
        words2 = set(text2.split())

        if not words1 and not words2:
            return 1.0
        if not words1 or not words2:
            return 0.0

        return len(words1 & words2) / len(words1 | words2)

    @classmethod
    def levenshtein_similarity(cls, s1: str, s2: str) -> float:
        """Calculate normalized Levenshtein similarity.

        The result is ``1 - distance / max(len(s1), len(s2))``: 1.0 for
        identical strings, 0.0 when every position differs. Two empty
        inputs give 1.0; exactly one empty input gives 0.0.
        """
        if not s1 and not s2:
            return 1.0
        if not s1 or not s2:
            return 0.0

        # Edit distance is symmetric; keep the shorter string on the row
        # axis so the two rolling rows are as small as possible.
        if len(s1) < len(s2):
            s1, s2 = s2, s1

        # previous[j] = distance between the processed prefix of s1 and
        # s2[:j]. Only the previous row is needed to build the next one.
        previous = list(range(len(s2) + 1))
        for i, char1 in enumerate(s1, start=1):
            current = [i]
            for j, char2 in enumerate(s2, start=1):
                if char1 == char2:
                    current.append(previous[j - 1])
                else:
                    current.append(
                        1 + min(previous[j], current[j - 1], previous[j - 1])
                    )
            previous = current

        distance = previous[-1]
        return 1.0 - (distance / len(s1))