""" Metadata comparison between bib entries and fetched metadata. """ from dataclasses import dataclass from typing import Optional from ..parsers.bib_parser import BibEntry from ..fetchers.arxiv_fetcher import ArxivMetadata from ..fetchers.scholar_fetcher import ScholarResult from ..fetchers.crossref_fetcher import CrossRefResult from ..fetchers.semantic_scholar_fetcher import SemanticScholarResult from ..fetchers.openalex_fetcher import OpenAlexResult from ..fetchers.dblp_fetcher import DBLPResult from ..utils.normalizer import TextNormalizer @dataclass class ComparisonResult: """Result of comparing bib entry with fetched metadata.""" entry_key: str # Title comparison title_match: bool title_similarity: float bib_title: str fetched_title: str # Author comparison author_match: bool author_similarity: float bib_authors: list[str] fetched_authors: list[str] # Year comparison year_match: bool bib_year: str fetched_year: str # Overall assessment is_match: bool confidence: float issues: list[str] source: str # 'arxiv', 'crossref', 'scholar', 'semantic_scholar', 'openalex', 'dblp', or 'unable' @property def has_issues(self) -> bool: return len(self.issues) > 0 class MetadataComparator: """Compares bibliography entries with fetched metadata.""" # Thresholds for matching TITLE_THRESHOLD = 0.8 AUTHOR_THRESHOLD = 0.6 def __init__(self): self.normalizer = TextNormalizer def compare_with_arxiv(self, bib_entry: BibEntry, arxiv_meta: ArxivMetadata) -> ComparisonResult: """Compare bib entry with arXiv metadata.""" issues = [] # Compare titles bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) arxiv_title_norm = self.normalizer.normalize_for_comparison(arxiv_meta.title) title_similarity = self.normalizer.similarity_ratio(bib_title_norm, arxiv_title_norm) # Also try Levenshtein for short titles if len(bib_title_norm) < 100: lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, arxiv_title_norm) title_similarity = max(title_similarity, lev_sim) title_match = title_similarity >= self.TITLE_THRESHOLD if not title_match: issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{arxiv_meta.title}'") # Compare authors bib_authors = self.normalizer.normalize_author_list(bib_entry.author) arxiv_authors = [self.normalizer.normalize_author_name(a) for a in arxiv_meta.authors] author_similarity = self._compare_author_lists(bib_authors, arxiv_authors) author_match = author_similarity >= self.AUTHOR_THRESHOLD if not author_match: issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(arxiv_authors)}") # Compare years bib_year = bib_entry.year.strip() arxiv_year = arxiv_meta.year year_match = bib_year == arxiv_year if not year_match and bib_year and arxiv_year: issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {arxiv_year}") # Overall assessment is_match = title_match and author_match confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) return ComparisonResult( entry_key=bib_entry.key, title_match=title_match, title_similarity=title_similarity, bib_title=bib_entry.title, fetched_title=arxiv_meta.title, author_match=author_match, author_similarity=author_similarity, bib_authors=bib_authors, fetched_authors=arxiv_authors, year_match=year_match, bib_year=bib_year, fetched_year=arxiv_year, is_match=is_match, confidence=confidence, issues=issues, source="arxiv" ) def compare_with_scholar(self, bib_entry: BibEntry, scholar_result: ScholarResult) -> ComparisonResult: """Compare bib entry with Scholar search result.""" issues = [] # Compare titles bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) scholar_title_norm = self.normalizer.normalize_for_comparison(scholar_result.title) title_similarity = self.normalizer.similarity_ratio(bib_title_norm, scholar_title_norm) if len(bib_title_norm) < 100: lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, scholar_title_norm) title_similarity = max(title_similarity, lev_sim) title_match = title_similarity >= self.TITLE_THRESHOLD if not title_match: issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{scholar_result.title}'") # Compare authors (Scholar format is less structured) bib_authors = self.normalizer.normalize_author_list(bib_entry.author) # Scholar authors are comma-separated scholar_authors_raw = scholar_result.authors.split(',') scholar_authors = [self.normalizer.normalize_author_name(a.strip()) for a in scholar_authors_raw] author_similarity = self._compare_author_lists(bib_authors, scholar_authors) author_match = author_similarity >= self.AUTHOR_THRESHOLD if not author_match: issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(scholar_authors)}") # Compare years bib_year = bib_entry.year.strip() scholar_year = scholar_result.year year_match = bib_year == scholar_year if not year_match and bib_year and scholar_year: issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {scholar_year}") # Overall assessment is_match = title_match and author_match confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) return ComparisonResult( entry_key=bib_entry.key, title_match=title_match, title_similarity=title_similarity, bib_title=bib_entry.title, fetched_title=scholar_result.title, author_match=author_match, author_similarity=author_similarity, bib_authors=bib_authors, fetched_authors=scholar_authors, year_match=year_match, bib_year=bib_year, fetched_year=scholar_year, is_match=is_match, confidence=confidence, issues=issues, source="scholar" ) def compare_with_crossref(self, bib_entry: BibEntry, crossref_result: CrossRefResult) -> ComparisonResult: """Compare bib entry with CrossRef search result.""" issues = [] # Compare titles bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) crossref_title_norm = self.normalizer.normalize_for_comparison(crossref_result.title) title_similarity = self.normalizer.similarity_ratio(bib_title_norm, crossref_title_norm) if len(bib_title_norm) < 100: lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, crossref_title_norm) title_similarity = max(title_similarity, lev_sim) title_match = title_similarity >= self.TITLE_THRESHOLD if not title_match: issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{crossref_result.title}'") # Compare authors bib_authors = self.normalizer.normalize_author_list(bib_entry.author) crossref_authors = [self.normalizer.normalize_author_name(a) for a in crossref_result.authors] author_similarity = self._compare_author_lists(bib_authors, crossref_authors) author_match = author_similarity >= self.AUTHOR_THRESHOLD if not author_match: issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(crossref_authors)}") # Compare years bib_year = bib_entry.year.strip() crossref_year = crossref_result.year year_match = bib_year == crossref_year if not year_match and bib_year and crossref_year: issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {crossref_year}") # Overall assessment is_match = title_match and author_match confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) return ComparisonResult( entry_key=bib_entry.key, title_match=title_match, title_similarity=title_similarity, bib_title=bib_entry.title, fetched_title=crossref_result.title, author_match=author_match, author_similarity=author_similarity, bib_authors=bib_authors, fetched_authors=crossref_authors, year_match=year_match, bib_year=bib_year, fetched_year=crossref_year, is_match=is_match, confidence=confidence, issues=issues, source="crossref" ) def create_unable_result(self, bib_entry: BibEntry, reason: str = "Unable to fetch metadata") -> ComparisonResult: """Create result when metadata couldn't be fetched.""" return ComparisonResult( entry_key=bib_entry.key, title_match=False, title_similarity=0.0, bib_title=bib_entry.title, fetched_title="", author_match=False, author_similarity=0.0, bib_authors=self.normalizer.normalize_author_list(bib_entry.author), fetched_authors=[], year_match=False, bib_year=bib_entry.year, fetched_year="", is_match=False, confidence=0.0, issues=[reason], source="unable" ) def _compare_author_lists(self, list1: list[str], list2: list[str]) -> float: """Compare two author lists.""" if not list1 and not list2: return 1.0 if not list1 or not list2: return 0.0 # Find best matches for each author in list1 total_similarity = 0.0 for author1 in list1: best_match = 0.0 for author2 in list2: # Check if one name contains the other (handle abbreviated names) if self._names_match(author1, author2): best_match = 1.0 break sim = self.normalizer.similarity_ratio(author1, author2) best_match = max(best_match, sim) total_similarity += best_match return total_similarity / len(list1) def _names_match(self, name1: str, name2: str) -> bool: """Check if two names match (handles abbreviated names).""" words1 = name1.split() words2 = name2.split() if not words1 or not words2: return False # Check if last names match if words1[-1] != words2[-1]: # Try first word as last name too if words1[0] != words2[-1] and words1[-1] != words2[0]: return False return True def compare_with_semantic_scholar(self, bib_entry: BibEntry, ss_result: SemanticScholarResult) -> ComparisonResult: """Compare bib entry with Semantic Scholar result.""" issues = [] # Compare titles bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) ss_title_norm = self.normalizer.normalize_for_comparison(ss_result.title) title_similarity = self.normalizer.similarity_ratio(bib_title_norm, ss_title_norm) if len(bib_title_norm) < 100: lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, ss_title_norm) title_similarity = max(title_similarity, lev_sim) title_match = title_similarity >= self.TITLE_THRESHOLD if not title_match: issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{ss_result.title}'") # Compare authors bib_authors = self.normalizer.normalize_author_list(bib_entry.author) ss_authors = [self.normalizer.normalize_author_name(a) for a in ss_result.authors] author_similarity = self._compare_author_lists(bib_authors, ss_authors) author_match = author_similarity >= self.AUTHOR_THRESHOLD if not author_match: issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(ss_authors)}") # Compare years bib_year = bib_entry.year.strip() ss_year = ss_result.year year_match = bib_year == ss_year if not year_match and bib_year and ss_year: issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {ss_year}") # Overall assessment is_match = title_match and author_match confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) return ComparisonResult( entry_key=bib_entry.key, title_match=title_match, title_similarity=title_similarity, bib_title=bib_entry.title, fetched_title=ss_result.title, author_match=author_match, author_similarity=author_similarity, bib_authors=bib_authors, fetched_authors=ss_authors, year_match=year_match, bib_year=bib_year, fetched_year=ss_year, is_match=is_match, confidence=confidence, issues=issues, source="semantic_scholar" ) def compare_with_openalex(self, bib_entry: BibEntry, oa_result: OpenAlexResult) -> ComparisonResult: """Compare bib entry with OpenAlex result.""" issues = [] # Compare titles bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) oa_title_norm = self.normalizer.normalize_for_comparison(oa_result.title) title_similarity = self.normalizer.similarity_ratio(bib_title_norm, oa_title_norm) if len(bib_title_norm) < 100: lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, oa_title_norm) title_similarity = max(title_similarity, lev_sim) title_match = title_similarity >= self.TITLE_THRESHOLD if not title_match: issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{oa_result.title}'") # Compare authors bib_authors = self.normalizer.normalize_author_list(bib_entry.author) oa_authors = [self.normalizer.normalize_author_name(a) for a in oa_result.authors] author_similarity = self._compare_author_lists(bib_authors, oa_authors) author_match = author_similarity >= self.AUTHOR_THRESHOLD if not author_match: issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(oa_authors)}") # Compare years bib_year = bib_entry.year.strip() oa_year = oa_result.year year_match = bib_year == oa_year if not year_match and bib_year and oa_year: issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {oa_year}") # Overall assessment is_match = title_match and author_match confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) return ComparisonResult( entry_key=bib_entry.key, title_match=title_match, title_similarity=title_similarity, bib_title=bib_entry.title, fetched_title=oa_result.title, author_match=author_match, author_similarity=author_similarity, bib_authors=bib_authors, fetched_authors=oa_authors, year_match=year_match, bib_year=bib_year, fetched_year=oa_year, is_match=is_match, confidence=confidence, issues=issues, source="openalex" ) def compare_with_dblp(self, bib_entry: BibEntry, dblp_result: DBLPResult) -> ComparisonResult: """Compare bib entry with DBLP result.""" issues = [] # Compare titles bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) dblp_title_norm = self.normalizer.normalize_for_comparison(dblp_result.title) title_similarity = self.normalizer.similarity_ratio(bib_title_norm, dblp_title_norm) if len(bib_title_norm) < 100: lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, dblp_title_norm) title_similarity = max(title_similarity, lev_sim) title_match = title_similarity >= self.TITLE_THRESHOLD if not title_match: issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{dblp_result.title}'") # Compare authors bib_authors = self.normalizer.normalize_author_list(bib_entry.author) dblp_authors = [self.normalizer.normalize_author_name(a) for a in dblp_result.authors] author_similarity = self._compare_author_lists(bib_authors, dblp_authors) author_match = author_similarity >= self.AUTHOR_THRESHOLD if not author_match: issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(dblp_authors)}") # Compare years bib_year = bib_entry.year.strip() dblp_year = dblp_result.year year_match = bib_year == dblp_year if not year_match and bib_year and dblp_year: issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {dblp_year}") # Overall assessment is_match = title_match and author_match confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) return ComparisonResult( entry_key=bib_entry.key, title_match=title_match, title_similarity=title_similarity, bib_title=bib_entry.title, fetched_title=dblp_result.title, author_match=author_match, author_similarity=author_similarity, bib_authors=bib_authors, fetched_authors=dblp_authors, year_match=year_match, bib_year=bib_year, fetched_year=dblp_year, is_match=is_match, confidence=confidence, issues=issues, source="dblp" )