| | from dataclasses import dataclass |
| | from datetime import datetime |
| | from typing import Optional, List |
| | from pathlib import Path |
| |
|
| | from ..parsers.bib_parser import BibEntry |
| | from ..analyzers.metadata_comparator import ComparisonResult |
| | from ..analyzers.duplicate_detector import DuplicateGroup |
| |
|
| |
|
| | @dataclass |
| | class EntryReport: |
| | """Report for a single bib entry (bib-only: entry + comparison).""" |
| | entry: BibEntry |
| | comparison: Optional[ComparisonResult] |
| |
|
| |
|
| | class ReportGenerator: |
| | """Generates bibliography-only markdown reports.""" |
| |
|
| | def __init__( |
| | self, |
| | minimal_verified: bool = False, |
| | check_preprint_ratio: bool = True, |
| | preprint_warning_threshold: float = 0.50, |
| | ): |
| | self.entries: List[EntryReport] = [] |
| | self.duplicate_groups: Optional[List[DuplicateGroup]] = None |
| | self.bib_files: List[str] = [] |
| | self.minimal_verified = minimal_verified |
| | self.check_preprint_ratio = check_preprint_ratio |
| | self.preprint_warning_threshold = preprint_warning_threshold |
| |
|
| | def add_entry_report(self, report: EntryReport): |
| | self.entries.append(report) |
| |
|
| | def set_metadata(self, bib_files: str | List[str], tex_files: str | List[str] = None): |
| | if isinstance(bib_files, str): |
| | self.bib_files = [bib_files] |
| | else: |
| | self.bib_files = list(bib_files) if bib_files else [] |
| |
|
| | def set_duplicate_groups(self, groups: List[DuplicateGroup]): |
| | self.duplicate_groups = groups |
| |
|
| | def _is_verified(self, entry: EntryReport) -> bool: |
| | return not self._has_issues(entry) |
| |
|
| | def _has_issues(self, entry: EntryReport) -> bool: |
| | return bool(entry.comparison and entry.comparison.has_issues) |
| |
|
| | def _is_preprint(self, entry: BibEntry) -> bool: |
| | preprint_keywords = [ |
| | "arxiv", "biorxiv", "medrxiv", "ssrn", "preprint", |
| | "openreview", "techreport", "technical report", "working paper", |
| | ] |
| | if entry.entry_type.lower() in ["techreport", "unpublished", "misc"]: |
| | text = " ".join([ |
| | entry.journal.lower(), entry.booktitle.lower(), |
| | entry.publisher.lower(), entry.entry_type.lower(), |
| | ]) |
| | if any(k in text for k in preprint_keywords): |
| | return True |
| | if entry.has_arxiv: |
| | return True |
| | venue = " ".join([entry.journal.lower(), entry.booktitle.lower(), entry.publisher.lower()]) |
| | return any(k in venue for k in preprint_keywords) |
| |
|
| | def get_summary_stats(self) -> dict: |
| | """Return bibliography issue counts only (no LaTeX).""" |
| | total = len(self.entries) |
| | title_mismatches = author_mismatches = year_mismatches = unable_to_verify = 0 |
| | for e in self.entries: |
| | if not e.comparison: |
| | continue |
| | if e.comparison.has_issues: |
| | for issue in e.comparison.issues: |
| | if "Title mismatch" in issue: |
| | title_mismatches += 1 |
| | elif "Author mismatch" in issue: |
| | author_mismatches += 1 |
| | elif "Year mismatch" in issue: |
| | year_mismatches += 1 |
| | elif "Unable to find" in issue: |
| | unable_to_verify += 1 |
| |
|
| | stats = {} |
| | if title_mismatches > 0: |
| | stats["Title Mismatches"] = title_mismatches |
| | if author_mismatches > 0: |
| | stats["Author Mismatches"] = author_mismatches |
| | if year_mismatches > 0: |
| | stats["Year Mismatches"] = year_mismatches |
| | if unable_to_verify > 0: |
| | stats["Unable to Verify"] = unable_to_verify |
| | if self.duplicate_groups: |
| | stats["Duplicate Groups"] = len(self.duplicate_groups) |
| | return stats |
| |
|
| | def _generate_issues_section(self) -> List[str]: |
| | lines = ["## β οΈ Critical Issues Detected", ""] |
| | has_any = False |
| |
|
| | if self.duplicate_groups: |
| | has_any = True |
| | lines.append("### π Duplicate Entries") |
| | for i, group in enumerate(self.duplicate_groups, 1): |
| | lines.append(f"#### Group {i} (Similarity: {group.similarity_score:.0%})") |
| | lines.append(f"**Reason:** {group.reason}") |
| | lines.append("") |
| | lines.append("| Key | Title | Year |") |
| | lines.append("|-----|-------|------|") |
| | for entry in group.entries: |
| | lines.append(f"| `{entry.key}` | {entry.title} | {entry.year} |") |
| | lines.append("") |
| |
|
| | issue_entries = [e for e in self.entries if self._has_issues(e)] |
| | if issue_entries: |
| | has_any = True |
| | lines.append("### β οΈ Metadata Issues") |
| | for report in issue_entries: |
| | lines.extend(self._format_entry_detail(report, is_verified=False)) |
| |
|
| | if not has_any: |
| | lines.append("π **No critical issues found!**") |
| | return lines |
| |
|
| | def _generate_verified_section(self) -> List[str]: |
| | lines = ["## β
Verified Entries", ""] |
| | verified = [e for e in self.entries if self._is_verified(e)] |
| | if not verified: |
| | lines.append("_No verified entries found._") |
| | return lines |
| | lines.append(f"Found **{len(verified)}** entries with correct metadata.") |
| | lines.append("") |
| | lines.append("<details>") |
| | lines.append("<summary>Click to view verified entries</summary>") |
| | lines.append("") |
| | for report in verified: |
| | lines.extend(self._format_entry_detail(report, minimal=self.minimal_verified, is_verified=True)) |
| | lines.append("</details>") |
| | return lines |
| |
|
| | def _format_entry_detail(self, report: EntryReport, minimal: bool = False, is_verified: bool = False) -> List[str]: |
| | entry = report.entry |
| | comp = report.comparison |
| | lines = [] |
| | icon = "β
" if is_verified else "β οΈ" |
| | lines.append(f"#### {icon} `{entry.key}`") |
| | lines.append(f"**Title:** {entry.title}") |
| | lines.append("") |
| | if comp: |
| | status_icon = "β
" if comp.is_match else "β" |
| | lines.append(f"- **Metadata Status:** {status_icon} {comp.source.upper()} (Confidence: {comp.confidence:.1%})") |
| | if comp.has_issues and not minimal: |
| | lines.append(" - **Discrepancies:**") |
| | for issue in comp.issues: |
| | if "Mismatch" in issue or "mismatch" in issue: |
| | lines.append(f" - π΄ {issue}") |
| | if "Title" in issue: |
| | lines.append(f" - **Bib:** `{comp.bib_title}`") |
| | lines.append(f" - **Fetched:** `{comp.fetched_title}`") |
| | elif "Author" in issue: |
| | lines.append(f" - **Bib:** `{', '.join(comp.bib_authors)}`") |
| | lines.append(f" - **Fetched:** `{', '.join(comp.fetched_authors)}`") |
| | else: |
| | lines.append(f" - πΈ {issue}") |
| | lines.append("") |
| | lines.append("---") |
| | lines.append("") |
| | return lines |
| |
|
| | def save_bibliography_report(self, filepath: str): |
| | """Generate and save bibliography-only report.""" |
| | total = len(self.entries) |
| | verified = sum(1 for e in self.entries if self._is_verified(e)) |
| | issues = sum(1 for e in self.entries if self._has_issues(e)) |
| | dup_str = str(len(self.duplicate_groups)) if self.duplicate_groups else "N/A" |
| |
|
| | preprint_str = "N/A" |
| | preprint_warning = [] |
| | if self.check_preprint_ratio and self.entries: |
| | preprint_count = sum(1 for e in self.entries if self._is_preprint(e.entry)) |
| | preprint_ratio = preprint_count / len(self.entries) |
| | preprint_str = f"{preprint_count} ({preprint_ratio:.1%})" |
| | if preprint_ratio > self.preprint_warning_threshold: |
| | preprint_warning = [ |
| | "", |
| | f"> β οΈ **High Preprint Ratio:** {preprint_ratio:.1%} of entries are preprints.", |
| | ] |
| |
|
| | bib_names = ", ".join([f"`{Path(f).name}`" for f in self.bib_files]) if self.bib_files else "N/A" |
| | lines = [ |
| | "# Bibliography Validation Report", |
| | "", |
| | f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", |
| | "", |
| | "| File Type | Filename |", |
| | "|-----------|----------|", |
| | f"| **Bib File(s)** | {bib_names} |", |
| | "", |
| | "> **β οΈ Disclaimer:** This report is generated by an automated tool. Please verify reported issues manually.", |
| | "", |
| | "## π Summary", |
| | "", |
| | "| Metric | Count |", |
| | "|--------|-------|", |
| | f"| **Total Entries** | {total} |", |
| | f"| β
**Verified (Clean)** | {verified} |", |
| | f"| β οΈ **With Issues** | {issues} |", |
| | f"| π **Duplicate Groups** | {dup_str} |", |
| | f"| π **Preprints** | {preprint_str} |", |
| | "", |
| | ] |
| | if preprint_warning: |
| | lines.extend(preprint_warning) |
| | lines.append("") |
| | lines.extend(self._generate_issues_section()) |
| | lines.append("") |
| | lines.extend(self._generate_verified_section()) |
| | lines.append("") |
| | lines.append("---") |
| | lines.append(f"Report generated by **CiteScan** on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") |
| |
|
| | with open(filepath, "w", encoding="utf-8") as f: |
| | f.write("\n".join(lines)) |
| |
|