# Glossarion / advanced_duplicate_detection.py
# Shirochi's picture
# Upload 93 files
# ec038f4 verified
# advanced_duplicate_detection.py
"""
Advanced duplicate detection for glossary entries.
Uses multiple algorithms and takes the best match.
"""
def get_similarity_score(name1, name2, threshold=0.90):
    """
    Calculate similarity using multiple algorithms and return the best score.

    Both names are normalized once (case-folded, then Unicode NFC) and the
    same normalized forms are handed to every available backend, so the
    result does not depend on letter case or on whether a character is
    stored precomposed or as base + combining marks.

    Optional backends (RapidFuzz, TheFuzz, Jellyfish, TextDistance) are used
    when importable; difflib is used only when none of them produced a score.

    NOTE(review): the RapidFuzz partial ratio is a substring-style match and
    the maximum of all scores is returned, so a short name contained in a
    longer one can score very high on its own - confirm this is intended.

    Args:
        name1: First name to compare
        name2: Second name to compare
        threshold: Minimum similarity threshold (0.0-1.0). Not used by the
            scoring itself; accepted so existing callers keep working.

    Returns:
        float: Best similarity score from all algorithms (0.0-1.0).
        0.0 when either name is empty/None, 1.0 when the normalized
        names are identical.
    """
    import unicodedata

    if not name1 or not name2:
        return 0.0

    # Normalize once so every algorithm compares the same text.
    norm1 = unicodedata.normalize("NFC", name1.casefold())
    norm2 = unicodedata.normalize("NFC", name2.casefold())

    # Quick exact match check
    if norm1 == norm2:
        return 1.0

    scores = []

    # Try RapidFuzz first (fastest)
    try:
        from rapidfuzz import fuzz
    except ImportError:
        pass
    else:
        # Basic ratio
        scores.append(fuzz.ratio(norm1, norm2) / 100.0)
        # Token sort (handles word order)
        scores.append(fuzz.token_sort_ratio(norm1, norm2) / 100.0)
        # Partial ratio (substring matching)
        scores.append(fuzz.partial_ratio(norm1, norm2) / 100.0)

    # Try TheFuzz/FuzzyWuzzy (more sophisticated)
    try:
        from thefuzz import fuzz as tfuzz
    except ImportError:
        pass
    else:
        # Token set ratio (best for name variations).
        # force_ascii defaults to True, which strips every non-ASCII
        # character before comparing; disable it so CJK/Hangul names are
        # compared on their actual characters.
        scores.append(
            tfuzz.token_set_ratio(norm1, norm2, force_ascii=False) / 100.0
        )

    # Try Jellyfish (phonetic matching for names)
    try:
        import jellyfish
    except ImportError:
        pass
    else:
        # Jaro-Winkler (designed for names, prioritizes prefix matches)
        scores.append(jellyfish.jaro_winkler_similarity(norm1, norm2))

    # Try TextDistance (additional algorithms)
    try:
        import textdistance
    except ImportError:
        pass
    else:
        # Jaro-Winkler from textdistance
        scores.append(
            textdistance.jaro_winkler.normalized_similarity(norm1, norm2)
        )
        # DamerauLevenshtein (handles transpositions)
        scores.append(
            textdistance.damerau_levenshtein.normalized_similarity(norm1, norm2)
        )

    # Fallback to difflib if no libraries available
    if not scores:
        from difflib import SequenceMatcher
        scores.append(SequenceMatcher(None, norm1, norm2).ratio())

    # Return the maximum score from all algorithms
    return max(scores)
def find_duplicates_advanced(entries, threshold=0.90, debug=False):
    """
    Find duplicates using advanced multi-algorithm approach.

    Entries are processed in order. Each entry's 'raw_name' is cleaned with
    remove_honorifics() and compared against the cleaned names of all
    entries kept so far; if any comparison reaches the threshold the entry
    is dropped and recorded as a duplicate of its best-scoring match,
    otherwise it is kept.

    Entries whose 'raw_name' is missing or empty are skipped entirely: they
    are neither kept nor counted as removed.

    Args:
        entries: List of dict entries with 'raw_name' field
        threshold: Similarity threshold (0.0-1.0)
        debug: Print debug information

    Returns:
        tuple: (deduplicated_entries, removed_count, duplicate_pairs) where
        duplicate_pairs is a list of dicts with keys 'duplicate',
        'original' and 'score'.
    """
    from remove_honorifics import remove_honorifics  # Your existing function

    seen_entries = []  # List of (cleaned_name, original_entry)
    deduplicated = []
    duplicate_pairs = []  # Track what was merged
    removed_count = 0

    if debug:
        print(f"[AdvancedDedup] Processing {len(entries)} entries with threshold {threshold:.2f}")

    for entry in entries:
        raw_name = entry.get('raw_name', '')
        if not raw_name:
            continue

        # Clean the name (remove honorifics)
        cleaned_name = remove_honorifics(raw_name)

        # Check against all seen entries, remembering the best match
        best_entry = None
        best_match_score = 0.0
        for seen_clean, seen_entry in seen_entries:
            # Get similarity score using multiple algorithms
            score = get_similarity_score(cleaned_name, seen_clean, threshold)
            if score < threshold:
                continue
            if best_entry is None or score > best_match_score:
                best_entry = seen_entry
                best_match_score = score
            # A perfect score cannot be beaten; stop scanning early.
            if best_match_score >= 1.0:
                break

        if best_entry is None:
            seen_entries.append((cleaned_name, entry))
            deduplicated.append(entry)
            continue

        matched_with = best_entry.get('raw_name', '')
        removed_count += 1
        duplicate_pairs.append({
            'duplicate': raw_name,
            'original': matched_with,
            'score': best_match_score
        })
        # Only the first 10 duplicates are printed to keep debug output short.
        if debug and removed_count <= 10:
            print(f"[AdvancedDedup] Duplicate: '{raw_name}' matches '{matched_with}' (score: {best_match_score:.3f})")

    if debug:
        print(f"[AdvancedDedup] Removed {removed_count} duplicates")
        print(f"[AdvancedDedup] Kept {len(deduplicated)} unique entries")

    return deduplicated, removed_count, duplicate_pairs
def get_available_algorithms():
    """Return display labels for the similarity backends that can be imported.

    Each optional backend is probed by importing it, in a fixed order. If
    none can be imported, the list contains only the difflib fallback label.
    """
    optional_backends = (
        ("rapidfuzz", "RapidFuzz (Basic + Token)"),
        ("thefuzz", "TheFuzz (Token Set)"),
        ("jellyfish", "Jellyfish (Jaro-Winkler)"),
        ("textdistance", "TextDistance (Multiple)"),
    )

    found = []
    for module_name, label in optional_backends:
        try:
            __import__(module_name)
        except ImportError:
            continue
        found.append(label)

    return found or ["difflib (Fallback)"]
if __name__ == "__main__":
    from shutdown_utils import run_cli_main

    def _main():
        """Print the available backends and score a few sample name pairs."""
        # Test the similarity scoring
        print("Available algorithms:")
        for algo in get_available_algorithms():
            print(f" ✓ {algo}")

        print("\nTest cases:")
        test_pairs = [
            ("김상현", "김상현님"),            # same name with honorific suffix
            ("Kim Sang-hyun", "Kim Sanghyun"),  # hyphenation variant
            ("김상현", "김상혁"),              # names differing in the last syllable
            ("Park Ji-sung", "Ji-sung Park"),   # word order swapped
            ("田中太郎", "田中太郎さん"),        # same name with honorific suffix
        ]
        for name1, name2 in test_pairs:
            score = get_similarity_score(name1, name2)
            status = "✓ MATCH" if score >= 0.90 else "✗ DIFFERENT"
            print(f"{status} '{name1}' vs '{name2}': {score:.3f}")
        return 0

    # NOTE(review): run_cli_main comes from the project's shutdown_utils;
    # presumably it invokes _main and handles its return value - confirm.
    run_cli_main(_main)