File size: 7,181 Bytes
46df5f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
"""
Duplicate entry detector for bibliography files.
Uses fuzzy matching to find potential duplicates.
"""
from dataclasses import dataclass
from typing import List, Tuple

from ..parsers.bib_parser import BibEntry
from ..utils.normalizer import TextNormalizer


@dataclass
class DuplicateGroup:
    """Bundle of bibliography entries suspected of describing the same work."""
    # Entries that were grouped together as likely duplicates.
    entries: List[BibEntry]
    # Similarity score assigned to the group by whoever built it.
    similarity_score: float
    # Human-readable explanation of why the entries were grouped.
    reason: str

    @property
    def entry_keys(self) -> List[str]:
        """Return the ``key`` of every entry, in the order stored in ``entries``."""
        keys = []
        for entry in self.entries:
            keys.append(entry.key)
        return keys


class DuplicateDetector:
    """Detects duplicate bibliography entries using fuzzy matching.

    Two entries are considered duplicates when their normalized titles are
    very similar, or when a weighted mix of title and author similarity
    reaches ``COMBINED_SIMILARITY_THRESHOLD``.
    """

    # Thresholds for duplicate detection
    TITLE_SIMILARITY_THRESHOLD = 0.85
    COMBINED_SIMILARITY_THRESHOLD = 0.80
    # Minimum string similarity for two author names to count as the same name
    AUTHOR_MATCH_THRESHOLD = 0.80

    def __init__(self):
        # NOTE(review): the TextNormalizer class itself is stored, not an
        # instance; its methods are invoked directly on the class, so they are
        # presumably static/class methods -- confirm against TextNormalizer.
        self.normalizer = TextNormalizer

    def find_duplicates(self, entries: List["BibEntry"]) -> List["DuplicateGroup"]:
        """
        Find all duplicate groups in the bibliography.

        Each not-yet-grouped entry is compared against every later
        not-yet-grouped entry; those that reach
        ``COMBINED_SIMILARITY_THRESHOLD`` join its group. Members are matched
        against the group's first entry only, not against each other.

        Args:
            entries: Bibliography entries to scan.

        Returns:
            List of DuplicateGroup objects, each containing 2+ similar entries,
            sorted by similarity score (highest first).
        """
        duplicates = []
        # Track grouped entries by position rather than by key: several
        # entries may share one citation key, and an unrelated entry that
        # reuses an already-grouped key must still be examined.
        grouped = set()

        for i, entry1 in enumerate(entries):
            if i in grouped:
                continue

            # Find all entries similar to this one
            similar_entries = [entry1]

            for j in range(i + 1, len(entries)):
                if j in grouped:
                    continue

                similarity, _ = self._calculate_similarity(entry1, entries[j])

                if similarity >= self.COMBINED_SIMILARITY_THRESHOLD:
                    similar_entries.append(entries[j])
                    grouped.add(j)

            # If we found duplicates, create a group
            if len(similar_entries) > 1:
                grouped.add(i)

                duplicates.append(DuplicateGroup(
                    entries=similar_entries,
                    similarity_score=self._calculate_group_similarity(similar_entries),
                    reason=self._generate_reason(similar_entries),
                ))

        # Sort by similarity score (highest first)
        duplicates.sort(key=lambda g: g.similarity_score, reverse=True)

        return duplicates

    def _calculate_similarity(self, entry1: "BibEntry", entry2: "BibEntry") -> Tuple[float, str]:
        """
        Calculate similarity between two entries.

        Returns:
            (similarity_score, reason_string); the reason is empty when the
            score is below ``COMBINED_SIMILARITY_THRESHOLD``.
        """
        # Normalize titles
        title1 = self.normalizer.normalize_for_comparison(entry1.title)
        title2 = self.normalizer.normalize_for_comparison(entry2.title)

        # A missing title carries no evidence of duplication, so two empty
        # titles must not be scored as "identical".
        if title1 and title2:
            title_sim = self.normalizer.similarity_ratio(title1, title2)
        else:
            title_sim = 0.0

        # If titles are very similar, likely a duplicate
        if title_sim >= self.TITLE_SIMILARITY_THRESHOLD:
            return title_sim, "Very similar titles"

        # Check author similarity
        author_sim = self._calculate_author_similarity(entry1, entry2)

        # Combined score: weighted average
        # Title is more important (70%) than authors (30%)
        combined_sim = 0.7 * title_sim + 0.3 * author_sim

        if combined_sim >= self.COMBINED_SIMILARITY_THRESHOLD:
            return combined_sim, f"Similar title ({title_sim:.0%}) and authors ({author_sim:.0%})"

        return combined_sim, ""

    def _calculate_author_similarity(self, entry1: "BibEntry", entry2: "BibEntry") -> float:
        """Calculate similarity between author lists.

        Returns a Jaccard-style ratio in [0.0, 1.0]: the number of author
        names paired up by ``_authors_match`` divided by the number of
        distinct authors across both lists. Returns 0.0 when either entry has
        no authors.
        """
        # Parse author strings
        authors1 = self._parse_authors(entry1.author)
        authors2 = self._parse_authors(entry2.author)

        if not authors1 or not authors2:
            return 0.0

        # Normalize author names
        norm_authors1 = [self.normalizer.normalize_for_comparison(a) for a in authors1]
        norm_authors2 = [self.normalizer.normalize_for_comparison(a) for a in authors2]

        # Pair names one-to-one: each name on the right may be consumed by at
        # most one name on the left, so the match count can never exceed the
        # shorter list.
        unmatched = list(norm_authors2)
        matches = 0
        for a1 in norm_authors1:
            for idx, a2 in enumerate(unmatched):
                if self._authors_match(a1, a2):
                    matches += 1
                    del unmatched[idx]
                    break

        # |A union B| under fuzzy matching: every matched pair is one author.
        # Both lists are non-empty here, so the denominator is positive.
        total_unique = len(norm_authors1) + len(norm_authors2) - matches

        return matches / total_unique

    def _parse_authors(self, author_string: str) -> List[str]:
        """Parse author string into list of names.

        Names are separated by the word ``and`` surrounded by whitespace.
        Whitespace runs (including newlines from wrapped fields) are collapsed
        first so that the separator is recognized regardless of line breaks.
        """
        if not author_string:
            return []

        # Collapse all whitespace to single spaces, then split by 'and'
        collapsed = ' '.join(author_string.split())

        return [name for name in collapsed.split(' and ') if name]

    def _authors_match(self, name1: str, name2: str) -> bool:
        """Check if two normalized author names match.

        Names match when they are equal, when one contains the other, or when
        their similarity ratio reaches ``AUTHOR_MATCH_THRESHOLD``. An empty
        name never matches.
        """
        # An empty string is a substring of everything; treat it as unknown
        if not name1 or not name2:
            return False

        # Simple exact match after normalization
        if name1 == name2:
            return True

        # One name contained in the other (e.g. surname only vs. full name)
        if name1 in name2 or name2 in name1:
            return True

        # Calculate string similarity
        sim = self.normalizer.similarity_ratio(name1, name2)
        return sim >= self.AUTHOR_MATCH_THRESHOLD

    def _calculate_group_similarity(self, entries: List["BibEntry"]) -> float:
        """Calculate average pairwise similarity within a group.

        Returns 1.0 for groups with fewer than two entries.
        """
        if len(entries) < 2:
            return 1.0

        total_sim = 0.0
        count = 0

        for i, entry1 in enumerate(entries):
            for entry2 in entries[i + 1:]:
                sim, _ = self._calculate_similarity(entry1, entry2)
                total_sim += sim
                count += 1

        return total_sim / count if count > 0 else 0.0

    def _generate_reason(self, entries: List["BibEntry"]) -> str:
        """Generate a human-readable reason for the duplicate group.

        The reason is chosen from the average pairwise similarity of the
        normalized titles.
        """
        titles = [self.normalizer.normalize_for_comparison(e.title) for e in entries]

        # Calculate pairwise title similarities
        title_sims = [
            self.normalizer.similarity_ratio(t1, t2)
            for i, t1 in enumerate(titles)
            for t2 in titles[i + 1:]
        ]

        avg_title_sim = sum(title_sims) / len(title_sims) if title_sims else 0.0

        if avg_title_sim >= 0.95:
            return "Nearly identical titles"
        elif avg_title_sim >= self.TITLE_SIMILARITY_THRESHOLD:
            return "Very similar titles"
        else:
            return "Similar titles and authors"