|
|
""" |
|
|
Duplicate entry detector for bibliography files. |
|
|
Uses fuzzy matching to find potential duplicates. |
|
|
""" |
|
|
from dataclasses import dataclass |
|
|
from typing import List, Tuple |
|
|
|
|
|
from ..parsers.bib_parser import BibEntry |
|
|
from ..utils.normalizer import TextNormalizer |
|
|
|
|
|
|
|
|
@dataclass
class DuplicateGroup:
    """A cluster of bibliography entries suspected to be the same work.

    Instances are produced by DuplicateDetector.find_duplicates.
    """

    # The 2+ entries judged similar enough to be duplicates of each other.
    entries: List[BibEntry]
    # Average pairwise similarity across the group, in [0, 1].
    similarity_score: float
    # Human-readable explanation of why the group was formed.
    reason: str

    @property
    def entry_keys(self) -> List[str]:
        """Citation keys of every entry in the group, in group order."""
        return [entry.key for entry in self.entries]
|
|
|
|
|
|
|
|
class DuplicateDetector: |
|
|
"""Detects duplicate bibliography entries using fuzzy matching.""" |
|
|
|
|
|
|
|
|
TITLE_SIMILARITY_THRESHOLD = 0.85 |
|
|
COMBINED_SIMILARITY_THRESHOLD = 0.80 |
|
|
|
|
|
def __init__(self): |
|
|
self.normalizer = TextNormalizer |
|
|
|
|
|
def find_duplicates(self, entries: List[BibEntry]) -> List[DuplicateGroup]: |
|
|
""" |
|
|
Find all duplicate groups in the bibliography. |
|
|
|
|
|
Returns: |
|
|
List of DuplicateGroup objects, each containing 2+ similar entries. |
|
|
""" |
|
|
duplicates = [] |
|
|
processed = set() |
|
|
|
|
|
for i, entry1 in enumerate(entries): |
|
|
if entry1.key in processed: |
|
|
continue |
|
|
|
|
|
|
|
|
similar_entries = [entry1] |
|
|
|
|
|
for j, entry2 in enumerate(entries[i+1:], start=i+1): |
|
|
if entry2.key in processed: |
|
|
continue |
|
|
|
|
|
similarity, reason = self._calculate_similarity(entry1, entry2) |
|
|
|
|
|
if similarity >= self.COMBINED_SIMILARITY_THRESHOLD: |
|
|
similar_entries.append(entry2) |
|
|
processed.add(entry2.key) |
|
|
|
|
|
|
|
|
if len(similar_entries) > 1: |
|
|
processed.add(entry1.key) |
|
|
|
|
|
|
|
|
avg_similarity = self._calculate_group_similarity(similar_entries) |
|
|
reason = self._generate_reason(similar_entries) |
|
|
|
|
|
duplicates.append(DuplicateGroup( |
|
|
entries=similar_entries, |
|
|
similarity_score=avg_similarity, |
|
|
reason=reason |
|
|
)) |
|
|
|
|
|
|
|
|
duplicates.sort(key=lambda g: g.similarity_score, reverse=True) |
|
|
|
|
|
return duplicates |
|
|
|
|
|
def _calculate_similarity(self, entry1: BibEntry, entry2: BibEntry) -> Tuple[float, str]: |
|
|
""" |
|
|
Calculate similarity between two entries. |
|
|
|
|
|
Returns: |
|
|
(similarity_score, reason_string) |
|
|
""" |
|
|
|
|
|
title1 = self.normalizer.normalize_for_comparison(entry1.title) |
|
|
title2 = self.normalizer.normalize_for_comparison(entry2.title) |
|
|
|
|
|
|
|
|
title_sim = self.normalizer.similarity_ratio(title1, title2) |
|
|
|
|
|
|
|
|
if title_sim >= self.TITLE_SIMILARITY_THRESHOLD: |
|
|
return title_sim, "Very similar titles" |
|
|
|
|
|
|
|
|
author_sim = self._calculate_author_similarity(entry1, entry2) |
|
|
|
|
|
|
|
|
|
|
|
combined_sim = 0.7 * title_sim + 0.3 * author_sim |
|
|
|
|
|
if combined_sim >= self.COMBINED_SIMILARITY_THRESHOLD: |
|
|
return combined_sim, f"Similar title ({title_sim:.0%}) and authors ({author_sim:.0%})" |
|
|
|
|
|
return combined_sim, "" |
|
|
|
|
|
def _calculate_author_similarity(self, entry1: BibEntry, entry2: BibEntry) -> float: |
|
|
"""Calculate similarity between author lists.""" |
|
|
|
|
|
authors1 = self._parse_authors(entry1.author) |
|
|
authors2 = self._parse_authors(entry2.author) |
|
|
|
|
|
if not authors1 or not authors2: |
|
|
return 0.0 |
|
|
|
|
|
|
|
|
norm_authors1 = [self.normalizer.normalize_for_comparison(a) for a in authors1] |
|
|
norm_authors2 = [self.normalizer.normalize_for_comparison(a) for a in authors2] |
|
|
|
|
|
|
|
|
matches = 0 |
|
|
for a1 in norm_authors1: |
|
|
for a2 in norm_authors2: |
|
|
if self._authors_match(a1, a2): |
|
|
matches += 1 |
|
|
break |
|
|
|
|
|
|
|
|
total_unique = len(set(norm_authors1) | set(norm_authors2)) |
|
|
if total_unique == 0: |
|
|
return 0.0 |
|
|
|
|
|
return matches / total_unique |
|
|
|
|
|
def _parse_authors(self, author_string: str) -> List[str]: |
|
|
"""Parse author string into list of names.""" |
|
|
if not author_string: |
|
|
return [] |
|
|
|
|
|
|
|
|
authors = author_string.split(' and ') |
|
|
|
|
|
|
|
|
cleaned = [] |
|
|
for author in authors: |
|
|
|
|
|
author = ' '.join(author.split()) |
|
|
if author: |
|
|
cleaned.append(author) |
|
|
|
|
|
return cleaned |
|
|
|
|
|
def _authors_match(self, name1: str, name2: str) -> bool: |
|
|
"""Check if two author names match (handles initials).""" |
|
|
|
|
|
if name1 == name2: |
|
|
return True |
|
|
|
|
|
|
|
|
if name1 in name2 or name2 in name1: |
|
|
return True |
|
|
|
|
|
|
|
|
sim = self.normalizer.similarity_ratio(name1, name2) |
|
|
return sim >= 0.8 |
|
|
|
|
|
def _calculate_group_similarity(self, entries: List[BibEntry]) -> float: |
|
|
"""Calculate average similarity within a group.""" |
|
|
if len(entries) < 2: |
|
|
return 1.0 |
|
|
|
|
|
total_sim = 0.0 |
|
|
count = 0 |
|
|
|
|
|
for i, entry1 in enumerate(entries): |
|
|
for entry2 in entries[i+1:]: |
|
|
sim, _ = self._calculate_similarity(entry1, entry2) |
|
|
total_sim += sim |
|
|
count += 1 |
|
|
|
|
|
return total_sim / count if count > 0 else 0.0 |
|
|
|
|
|
def _generate_reason(self, entries: List[BibEntry]) -> str: |
|
|
"""Generate a human-readable reason for the duplicate group.""" |
|
|
|
|
|
titles = [self.normalizer.normalize_for_comparison(e.title) for e in entries] |
|
|
|
|
|
|
|
|
title_sims = [] |
|
|
for i, t1 in enumerate(titles): |
|
|
for t2 in titles[i+1:]: |
|
|
title_sims.append(self.normalizer.similarity_ratio(t1, t2)) |
|
|
|
|
|
avg_title_sim = sum(title_sims) / len(title_sims) if title_sims else 0.0 |
|
|
|
|
|
if avg_title_sim >= 0.95: |
|
|
return "Nearly identical titles" |
|
|
elif avg_title_sim >= 0.85: |
|
|
return "Very similar titles" |
|
|
else: |
|
|
return "Similar titles and authors" |
|
|
|