thinkwee committed on
Commit
39b06be
·
1 Parent(s): 4a7a641

Update metadata comparison: mismatch messages now show Bib vs. Retrieved values

Browse files
Files changed (1) hide show
  1. src/analyzers/metadata_comparator.py +18 -18
src/analyzers/metadata_comparator.py CHANGED
@@ -74,7 +74,7 @@ class MetadataComparator:
74
  title_match = title_similarity >= self.TITLE_THRESHOLD
75
 
76
  if not title_match:
77
- issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")
78
 
79
  # Compare authors
80
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
@@ -84,7 +84,7 @@ class MetadataComparator:
84
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
85
 
86
  if not author_match:
87
- issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
88
 
89
  # Compare years
90
  bib_year = bib_entry.year.strip()
@@ -92,7 +92,7 @@ class MetadataComparator:
92
  year_match = bib_year == arxiv_year
93
 
94
  if not year_match and bib_year and arxiv_year:
95
- issues.append(f"Year mismatch: bib={bib_year}, arxiv={arxiv_year}")
96
 
97
  # Overall assessment
98
  is_match = title_match and author_match
@@ -133,7 +133,7 @@ class MetadataComparator:
133
  title_match = title_similarity >= self.TITLE_THRESHOLD
134
 
135
  if not title_match:
136
- issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")
137
 
138
  # Compare authors (Scholar format is less structured)
139
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
@@ -145,7 +145,7 @@ class MetadataComparator:
145
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
146
 
147
  if not author_match:
148
- issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
149
 
150
  # Compare years
151
  bib_year = bib_entry.year.strip()
@@ -153,7 +153,7 @@ class MetadataComparator:
153
  year_match = bib_year == scholar_year
154
 
155
  if not year_match and bib_year and scholar_year:
156
- issues.append(f"Year mismatch: bib={bib_year}, scholar={scholar_year}")
157
 
158
  # Overall assessment
159
  is_match = title_match and author_match
@@ -194,7 +194,7 @@ class MetadataComparator:
194
  title_match = title_similarity >= self.TITLE_THRESHOLD
195
 
196
  if not title_match:
197
- issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")
198
 
199
  # Compare authors
200
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
@@ -204,7 +204,7 @@ class MetadataComparator:
204
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
205
 
206
  if not author_match:
207
- issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
208
 
209
  # Compare years
210
  bib_year = bib_entry.year.strip()
@@ -212,7 +212,7 @@ class MetadataComparator:
212
  year_match = bib_year == crossref_year
213
 
214
  if not year_match and bib_year and crossref_year:
215
- issues.append(f"Year mismatch: bib={bib_year}, crossref={crossref_year}")
216
 
217
  # Overall assessment
218
  is_match = title_match and author_match
@@ -312,7 +312,7 @@ class MetadataComparator:
312
  title_match = title_similarity >= self.TITLE_THRESHOLD
313
 
314
  if not title_match:
315
- issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")
316
 
317
  # Compare authors
318
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
@@ -322,7 +322,7 @@ class MetadataComparator:
322
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
323
 
324
  if not author_match:
325
- issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
326
 
327
  # Compare years
328
  bib_year = bib_entry.year.strip()
@@ -330,7 +330,7 @@ class MetadataComparator:
330
  year_match = bib_year == ss_year
331
 
332
  if not year_match and bib_year and ss_year:
333
- issues.append(f"Year mismatch: bib={bib_year}, semantic_scholar={ss_year}")
334
 
335
  # Overall assessment
336
  is_match = title_match and author_match
@@ -371,7 +371,7 @@ class MetadataComparator:
371
  title_match = title_similarity >= self.TITLE_THRESHOLD
372
 
373
  if not title_match:
374
- issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")
375
 
376
  # Compare authors
377
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
@@ -381,7 +381,7 @@ class MetadataComparator:
381
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
382
 
383
  if not author_match:
384
- issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
385
 
386
  # Compare years
387
  bib_year = bib_entry.year.strip()
@@ -389,7 +389,7 @@ class MetadataComparator:
389
  year_match = bib_year == oa_year
390
 
391
  if not year_match and bib_year and oa_year:
392
- issues.append(f"Year mismatch: bib={bib_year}, openalex={oa_year}")
393
 
394
  # Overall assessment
395
  is_match = title_match and author_match
@@ -430,7 +430,7 @@ class MetadataComparator:
430
  title_match = title_similarity >= self.TITLE_THRESHOLD
431
 
432
  if not title_match:
433
- issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")
434
 
435
  # Compare authors
436
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
@@ -440,7 +440,7 @@ class MetadataComparator:
440
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
441
 
442
  if not author_match:
443
- issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
444
 
445
  # Compare years
446
  bib_year = bib_entry.year.strip()
@@ -448,7 +448,7 @@ class MetadataComparator:
448
  year_match = bib_year == dblp_year
449
 
450
  if not year_match and bib_year and dblp_year:
451
- issues.append(f"Year mismatch: bib={bib_year}, dblp={dblp_year}")
452
 
453
  # Overall assessment
454
  is_match = title_match and author_match
 
74
  title_match = title_similarity >= self.TITLE_THRESHOLD
75
 
76
  if not title_match:
77
+ issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{arxiv_meta.title}'")
78
 
79
  # Compare authors
80
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
 
84
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
85
 
86
  if not author_match:
87
+ issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(arxiv_authors)}")
88
 
89
  # Compare years
90
  bib_year = bib_entry.year.strip()
 
92
  year_match = bib_year == arxiv_year
93
 
94
  if not year_match and bib_year and arxiv_year:
95
+ issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {arxiv_year}")
96
 
97
  # Overall assessment
98
  is_match = title_match and author_match
 
133
  title_match = title_similarity >= self.TITLE_THRESHOLD
134
 
135
  if not title_match:
136
+ issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{scholar_result.title}'")
137
 
138
  # Compare authors (Scholar format is less structured)
139
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
 
145
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
146
 
147
  if not author_match:
148
+ issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(scholar_authors)}")
149
 
150
  # Compare years
151
  bib_year = bib_entry.year.strip()
 
153
  year_match = bib_year == scholar_year
154
 
155
  if not year_match and bib_year and scholar_year:
156
+ issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {scholar_year}")
157
 
158
  # Overall assessment
159
  is_match = title_match and author_match
 
194
  title_match = title_similarity >= self.TITLE_THRESHOLD
195
 
196
  if not title_match:
197
+ issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{crossref_result.title}'")
198
 
199
  # Compare authors
200
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
 
204
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
205
 
206
  if not author_match:
207
+ issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(crossref_authors)}")
208
 
209
  # Compare years
210
  bib_year = bib_entry.year.strip()
 
212
  year_match = bib_year == crossref_year
213
 
214
  if not year_match and bib_year and crossref_year:
215
+ issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {crossref_year}")
216
 
217
  # Overall assessment
218
  is_match = title_match and author_match
 
312
  title_match = title_similarity >= self.TITLE_THRESHOLD
313
 
314
  if not title_match:
315
+ issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{ss_result.title}'")
316
 
317
  # Compare authors
318
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
 
322
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
323
 
324
  if not author_match:
325
+ issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(ss_authors)}")
326
 
327
  # Compare years
328
  bib_year = bib_entry.year.strip()
 
330
  year_match = bib_year == ss_year
331
 
332
  if not year_match and bib_year and ss_year:
333
+ issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {ss_year}")
334
 
335
  # Overall assessment
336
  is_match = title_match and author_match
 
371
  title_match = title_similarity >= self.TITLE_THRESHOLD
372
 
373
  if not title_match:
374
+ issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{oa_result.title}'")
375
 
376
  # Compare authors
377
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
 
381
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
382
 
383
  if not author_match:
384
+ issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(oa_authors)}")
385
 
386
  # Compare years
387
  bib_year = bib_entry.year.strip()
 
389
  year_match = bib_year == oa_year
390
 
391
  if not year_match and bib_year and oa_year:
392
+ issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {oa_year}")
393
 
394
  # Overall assessment
395
  is_match = title_match and author_match
 
430
  title_match = title_similarity >= self.TITLE_THRESHOLD
431
 
432
  if not title_match:
433
+ issues.append(f"Title mismatch. Bib: '{bib_entry.title}', Retrieved: '{dblp_result.title}'")
434
 
435
  # Compare authors
436
  bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
 
440
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
441
 
442
  if not author_match:
443
+ issues.append(f"Author mismatch. Bib: {', '.join(bib_authors)}, Retrieved: {', '.join(dblp_authors)}")
444
 
445
  # Compare years
446
  bib_year = bib_entry.year.strip()
 
448
  year_match = bib_year == dblp_year
449
 
450
  if not year_match and bib_year and dblp_year:
451
+ issues.append(f"Year mismatch. Bib: {bib_year}, Retrieved: {dblp_year}")
452
 
453
  # Overall assessment
454
  is_match = title_match and author_match