thinkwee
committed on
Commit
·
39b06be
1
Parent(s):
4a7a641
update metadata compare
Browse files
src/analyzers/metadata_comparator.py
CHANGED
|
@@ -74,7 +74,7 @@ class MetadataComparator:
|
|
| 74 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 75 |
|
| 76 |
if not title_match:
|
| 77 |
-
issues.append(f"Title mismatch
|
| 78 |
|
| 79 |
# Compare authors
|
| 80 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
@@ -84,7 +84,7 @@ class MetadataComparator:
|
|
| 84 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 85 |
|
| 86 |
if not author_match:
|
| 87 |
-
issues.append(f"Author mismatch (
|
| 88 |
|
| 89 |
# Compare years
|
| 90 |
bib_year = bib_entry.year.strip()
|
|
@@ -92,7 +92,7 @@ class MetadataComparator:
|
|
| 92 |
year_match = bib_year == arxiv_year
|
| 93 |
|
| 94 |
if not year_match and bib_year and arxiv_year:
|
| 95 |
-
issues.append(f"Year mismatch:
|
| 96 |
|
| 97 |
# Overall assessment
|
| 98 |
is_match = title_match and author_match
|
|
@@ -133,7 +133,7 @@ class MetadataComparator:
|
|
| 133 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 134 |
|
| 135 |
if not title_match:
|
| 136 |
-
issues.append(f"Title mismatch
|
| 137 |
|
| 138 |
# Compare authors (Scholar format is less structured)
|
| 139 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
@@ -145,7 +145,7 @@ class MetadataComparator:
|
|
| 145 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 146 |
|
| 147 |
if not author_match:
|
| 148 |
-
issues.append(f"Author mismatch (
|
| 149 |
|
| 150 |
# Compare years
|
| 151 |
bib_year = bib_entry.year.strip()
|
|
@@ -153,7 +153,7 @@ class MetadataComparator:
|
|
| 153 |
year_match = bib_year == scholar_year
|
| 154 |
|
| 155 |
if not year_match and bib_year and scholar_year:
|
| 156 |
-
issues.append(f"Year mismatch:
|
| 157 |
|
| 158 |
# Overall assessment
|
| 159 |
is_match = title_match and author_match
|
|
@@ -194,7 +194,7 @@ class MetadataComparator:
|
|
| 194 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 195 |
|
| 196 |
if not title_match:
|
| 197 |
-
issues.append(f"Title mismatch
|
| 198 |
|
| 199 |
# Compare authors
|
| 200 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
@@ -204,7 +204,7 @@ class MetadataComparator:
|
|
| 204 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 205 |
|
| 206 |
if not author_match:
|
| 207 |
-
issues.append(f"Author mismatch (
|
| 208 |
|
| 209 |
# Compare years
|
| 210 |
bib_year = bib_entry.year.strip()
|
|
@@ -212,7 +212,7 @@ class MetadataComparator:
|
|
| 212 |
year_match = bib_year == crossref_year
|
| 213 |
|
| 214 |
if not year_match and bib_year and crossref_year:
|
| 215 |
-
issues.append(f"Year mismatch:
|
| 216 |
|
| 217 |
# Overall assessment
|
| 218 |
is_match = title_match and author_match
|
|
@@ -312,7 +312,7 @@ class MetadataComparator:
|
|
| 312 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 313 |
|
| 314 |
if not title_match:
|
| 315 |
-
issues.append(f"Title mismatch
|
| 316 |
|
| 317 |
# Compare authors
|
| 318 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
@@ -322,7 +322,7 @@ class MetadataComparator:
|
|
| 322 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 323 |
|
| 324 |
if not author_match:
|
| 325 |
-
issues.append(f"Author mismatch (
|
| 326 |
|
| 327 |
# Compare years
|
| 328 |
bib_year = bib_entry.year.strip()
|
|
@@ -330,7 +330,7 @@ class MetadataComparator:
|
|
| 330 |
year_match = bib_year == ss_year
|
| 331 |
|
| 332 |
if not year_match and bib_year and ss_year:
|
| 333 |
-
issues.append(f"Year mismatch:
|
| 334 |
|
| 335 |
# Overall assessment
|
| 336 |
is_match = title_match and author_match
|
|
@@ -371,7 +371,7 @@ class MetadataComparator:
|
|
| 371 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 372 |
|
| 373 |
if not title_match:
|
| 374 |
-
issues.append(f"Title mismatch
|
| 375 |
|
| 376 |
# Compare authors
|
| 377 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
@@ -381,7 +381,7 @@ class MetadataComparator:
|
|
| 381 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 382 |
|
| 383 |
if not author_match:
|
| 384 |
-
issues.append(f"Author mismatch (
|
| 385 |
|
| 386 |
# Compare years
|
| 387 |
bib_year = bib_entry.year.strip()
|
|
@@ -389,7 +389,7 @@ class MetadataComparator:
|
|
| 389 |
year_match = bib_year == oa_year
|
| 390 |
|
| 391 |
if not year_match and bib_year and oa_year:
|
| 392 |
-
issues.append(f"Year mismatch:
|
| 393 |
|
| 394 |
# Overall assessment
|
| 395 |
is_match = title_match and author_match
|
|
@@ -430,7 +430,7 @@ class MetadataComparator:
|
|
| 430 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 431 |
|
| 432 |
if not title_match:
|
| 433 |
-
issues.append(f"Title mismatch
|
| 434 |
|
| 435 |
# Compare authors
|
| 436 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
@@ -440,7 +440,7 @@ class MetadataComparator:
|
|
| 440 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 441 |
|
| 442 |
if not author_match:
|
| 443 |
-
issues.append(f"Author mismatch (
|
| 444 |
|
| 445 |
# Compare years
|
| 446 |
bib_year = bib_entry.year.strip()
|
|
@@ -448,7 +448,7 @@ class MetadataComparator:
|
|
| 448 |
year_match = bib_year == dblp_year
|
| 449 |
|
| 450 |
if not year_match and bib_year and dblp_year:
|
| 451 |
-
issues.append(f"Year mismatch:
|
| 452 |
|
| 453 |
# Overall assessment
|
| 454 |
is_match = title_match and author_match
|
|
|
|
| 74 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 75 |
|
| 76 |
if not title_match:
|
| 77 |
+
issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{arxiv_meta.title}'")
|
| 78 |
|
| 79 |
# Compare authors
|
| 80 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
|
|
| 84 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 85 |
|
| 86 |
if not author_match:
|
| 87 |
+
issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(arxiv_authors)}")
|
| 88 |
|
| 89 |
# Compare years
|
| 90 |
bib_year = bib_entry.year.strip()
|
|
|
|
| 92 |
year_match = bib_year == arxiv_year
|
| 93 |
|
| 94 |
if not year_match and bib_year and arxiv_year:
|
| 95 |
+
issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {arxiv_year}")
|
| 96 |
|
| 97 |
# Overall assessment
|
| 98 |
is_match = title_match and author_match
|
|
|
|
| 133 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 134 |
|
| 135 |
if not title_match:
|
| 136 |
+
issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{scholar_result.title}'")
|
| 137 |
|
| 138 |
# Compare authors (Scholar format is less structured)
|
| 139 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
|
|
| 145 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 146 |
|
| 147 |
if not author_match:
|
| 148 |
+
issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(scholar_authors)}")
|
| 149 |
|
| 150 |
# Compare years
|
| 151 |
bib_year = bib_entry.year.strip()
|
|
|
|
| 153 |
year_match = bib_year == scholar_year
|
| 154 |
|
| 155 |
if not year_match and bib_year and scholar_year:
|
| 156 |
+
issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {scholar_year}")
|
| 157 |
|
| 158 |
# Overall assessment
|
| 159 |
is_match = title_match and author_match
|
|
|
|
| 194 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 195 |
|
| 196 |
if not title_match:
|
| 197 |
+
issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{crossref_result.title}'")
|
| 198 |
|
| 199 |
# Compare authors
|
| 200 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
|
|
| 204 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 205 |
|
| 206 |
if not author_match:
|
| 207 |
+
issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(crossref_authors)}")
|
| 208 |
|
| 209 |
# Compare years
|
| 210 |
bib_year = bib_entry.year.strip()
|
|
|
|
| 212 |
year_match = bib_year == crossref_year
|
| 213 |
|
| 214 |
if not year_match and bib_year and crossref_year:
|
| 215 |
+
issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {crossref_year}")
|
| 216 |
|
| 217 |
# Overall assessment
|
| 218 |
is_match = title_match and author_match
|
|
|
|
| 312 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 313 |
|
| 314 |
if not title_match:
|
| 315 |
+
issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{ss_result.title}'")
|
| 316 |
|
| 317 |
# Compare authors
|
| 318 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
|
|
| 322 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 323 |
|
| 324 |
if not author_match:
|
| 325 |
+
issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(ss_authors)}")
|
| 326 |
|
| 327 |
# Compare years
|
| 328 |
bib_year = bib_entry.year.strip()
|
|
|
|
| 330 |
year_match = bib_year == ss_year
|
| 331 |
|
| 332 |
if not year_match and bib_year and ss_year:
|
| 333 |
+
issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {ss_year}")
|
| 334 |
|
| 335 |
# Overall assessment
|
| 336 |
is_match = title_match and author_match
|
|
|
|
| 371 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 372 |
|
| 373 |
if not title_match:
|
| 374 |
+
issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{oa_result.title}'")
|
| 375 |
|
| 376 |
# Compare authors
|
| 377 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
|
|
| 381 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 382 |
|
| 383 |
if not author_match:
|
| 384 |
+
issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(oa_authors)}")
|
| 385 |
|
| 386 |
# Compare years
|
| 387 |
bib_year = bib_entry.year.strip()
|
|
|
|
| 389 |
year_match = bib_year == oa_year
|
| 390 |
|
| 391 |
if not year_match and bib_year and oa_year:
|
| 392 |
+
issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {oa_year}")
|
| 393 |
|
| 394 |
# Overall assessment
|
| 395 |
is_match = title_match and author_match
|
|
|
|
| 430 |
title_match = title_similarity >= self.TITLE_THRESHOLD
|
| 431 |
|
| 432 |
if not title_match:
|
| 433 |
+
issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{dblp_result.title}'")
|
| 434 |
|
| 435 |
# Compare authors
|
| 436 |
bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
|
|
|
|
| 440 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 441 |
|
| 442 |
if not author_match:
|
| 443 |
+
issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(dblp_authors)}")
|
| 444 |
|
| 445 |
# Compare years
|
| 446 |
bib_year = bib_entry.year.strip()
|
|
|
|
| 448 |
year_match = bib_year == dblp_year
|
| 449 |
|
| 450 |
if not year_match and bib_year and dblp_year:
|
| 451 |
+
issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {dblp_year}")
|
| 452 |
|
| 453 |
# Overall assessment
|
| 454 |
is_match = title_match and author_match
|