"""
Report generator for bibliography check results.
"""
import json
import re
from dataclasses import asdict, dataclass, is_dataclass
from datetime import datetime
from typing import Any, Optional, List, Dict
from pathlib import Path
from ..parsers.bib_parser import BibEntry
from ..analyzers.metadata_comparator import ComparisonResult
from ..analyzers.usage_checker import UsageResult
from ..analyzers.llm_evaluator import EvaluationResult
from ..analyzers.duplicate_detector import DuplicateGroup
from ..checkers.base import CheckResult, CheckSeverity
from .html_report import render_standalone_html
@dataclass
class EntryReport:
    """Complete report for a single bib entry.

    Bundles the parsed entry with the results of the checks that were run
    on it. ``comparison`` and ``usage`` are ``None`` when the corresponding
    check produced no result for this entry.
    """
    # The bibliography entry this report describes.
    entry: BibEntry
    # Metadata comparison result, or None if not available.
    comparison: Optional[ComparisonResult]
    # Citation-usage result, or None if not available.
    usage: Optional[UsageResult]
    # Relevance evaluations for this entry (may be empty).
    evaluations: list[EvaluationResult]
def _json_default(o):
if is_dataclass(o):
return asdict(o)
if hasattr(o, "value"):
return o.value
return str(o)
class ReportGenerator:
    """Collects check results and renders them as Markdown, JSON or HTML reports."""

    def __init__(self, minimal_verified: bool = False, check_preprint_ratio: bool = True, preprint_warning_threshold: float = 0.50):
        """Create an empty generator.

        Args:
            minimal_verified: Show only minimal info for verified entries.
            check_preprint_ratio: Whether the summary computes the preprint ratio.
            preprint_warning_threshold: Ratio above which a preprint warning is emitted.
        """
        # Rendering / analysis options.
        self.minimal_verified = minimal_verified
        self.check_preprint_ratio = check_preprint_ratio
        self.preprint_warning_threshold = preprint_warning_threshold

        # Source files; the singular attributes are kept for backward
        # compatibility with single-file callers.
        self.bib_files: list[str] = []
        self.tex_files: list[str] = []
        self.bib_file: str = ""
        self.tex_file: str = ""

        # Bibliography results.
        self.entries: list[EntryReport] = []
        self.missing_citations: list[str] = []
        # None (as opposed to an empty list) means the duplicate check was not run.
        self.duplicate_groups: list[DuplicateGroup] | None = None

        # Submission quality (LaTeX) results and the conference template, if any.
        self.submission_results: List[CheckResult] = []
        self.template = None

        # Retraction (F1) and URL (F2) findings.
        self.retraction_findings: list = []
        self.url_findings: list = []
def set_retraction_findings(self, findings) -> None:
    """Store a private copy of the retraction findings (falsy input -> empty list)."""
    self.retraction_findings = list(findings) if findings else []
def set_url_findings(self, findings) -> None:
    """Store a private copy of the URL findings (falsy input -> empty list)."""
    self.url_findings = list(findings) if findings else []
def add_entry_report(self, report: EntryReport):
    """Append one per-entry report to ``self.entries``.

    Args:
        report: The entry report to include in generated output.
    """
    self.entries.append(report)
def set_metadata(self, bib_files: str | list[str], tex_files: str | list[str]):
"""Set source file information."""
if isinstance(bib_files, str):
self.bib_files = [bib_files]
self.bib_file = bib_files
else:
self.bib_files = bib_files
self.bib_file = bib_files[0] if bib_files else ""
if isinstance(tex_files, str):
self.tex_files = [tex_files]
self.tex_file = tex_files
else:
self.tex_files = tex_files
self.tex_file = tex_files[0] if tex_files else ""
def set_missing_citations(self, missing: list[str]):
    """Set list of citations without bib entries.

    Args:
        missing: Citation keys to report as missing; stored as given.
    """
    self.missing_citations = missing
def set_duplicate_groups(self, groups: list[DuplicateGroup]):
    """Set list of duplicate entry groups.

    Until this is called ``self.duplicate_groups`` stays ``None``, which the
    summary renders as "N/A" (check not run).
    """
    self.duplicate_groups = groups
def set_submission_results(self, results: List[CheckResult], template=None):
    """Attach the submission quality check results and the optional template."""
    self.submission_results, self.template = results, template
def generate(self) -> str:
    """Render the complete Markdown report as a single string.

    Sections appear in this order, each followed by a blank line: header,
    disclaimer, summary, issues, verified entries and (only when submission
    results exist) the submission quality section. The footer closes the
    report.
    """
    sections = [
        self._generate_header(),
        self._generate_disclaimer(),
        self._generate_summary(),
        self._generate_issues_section(),
        self._generate_verified_section(),
    ]
    # LaTeX quality checks are optional.
    if self.submission_results:
        sections.append(self._generate_submission_section())

    report_lines = []
    for section in sections:
        report_lines.extend(section)
        report_lines.append("")
    report_lines.extend(self._generate_footer())
    return "\n".join(report_lines)
def get_summary_stats(self) -> tuple[dict, dict]:
    """Get summary statistics as dictionaries for console display (Issues only).

    Returns:
        A ``(bib_stats, latex_stats)`` pair.

        * ``bib_stats`` maps a display label (e.g. ``"Title Mismatches"``) to
          a count; labels whose count is zero are omitted.
        * ``latex_stats`` maps a rule display name to the number of *failed*
          submission results attributed to that rule.
    """
    # Bibliography issues breakdown
    title_mismatches = 0
    author_mismatches = 0
    year_mismatches = 0
    low_relevance = 0
    unable_to_verify = 0

    for report in self.entries:
        comparison = report.comparison
        # Metadata issues
        if comparison and comparison.has_issues:
            has_title = has_author = has_year = False
            for issue in comparison.issues:
                # Each issue string is attributed to the first category it matches.
                if "Title mismatch" in issue:
                    has_title = True
                elif "Author mismatch" in issue:
                    has_author = True
                elif "Year mismatch" in issue:
                    has_year = True
                elif "Unable to find" in issue:
                    # Counted once per matching issue string, whereas the
                    # mismatch categories are counted once per entry.
                    unable_to_verify += 1
            if has_title:
                title_mismatches += 1
            if has_author:
                author_mismatches += 1
            if has_year:
                year_mismatches += 1

        # Relevance issues
        if any(ev.relevance_score <= 2 for ev in report.evaluations):
            low_relevance += 1

    counted = (
        ("Title Mismatches", title_mismatches),
        ("Author Mismatches", author_mismatches),
        ("Year Mismatches", year_mismatches),
        ("Low Relevance", low_relevance),
        ("Unable to Verify", unable_to_verify),
    )
    bib_stats = {label: count for label, count in counted if count > 0}

    if self.duplicate_groups:
        bib_stats["Duplicate Groups"] = len(self.duplicate_groups)
    if self.missing_citations:
        bib_stats["Missing Bib Entries"] = len(self.missing_citations)
    unused = [e for e in self.entries if e.usage and not e.usage.is_used]
    if unused:
        bib_stats["Unused Entries"] = len(unused)

    # LaTeX stats - grouped by display rule name. Keys are substrings looked
    # for in the raw checker message; the first matching key (in this order) wins.
    RULE_MAPPING = {
        "Very long sentence": "Sentence Length (Critical)",
        "Long sentence": "Sentence Length (Warning)",
        "Possible Markdown bullet point": "Markdown Bullet Point",
        "Possible Markdown numbered list": "Markdown Numbered List",
        "Possible Markdown italic": "Markdown Italic",
        "Possible Markdown bold": "Markdown Bold",
        "Inconsistent hyphenation": "Hyphenation Inconsistency",
        "Inconsistent spelling": "Spelling Inconsistency",
        "Unreferenced figure": "Unreferenced Figure",
        "Unreferenced table": "Unreferenced Table",
        "Unreferenced section": "Unreferenced Section",
        "Unreferenced label": "Unreferenced Label",
        "Citation from": "Old Citation (10+ years)",
        "Hedging language": "Hedging/Vague Language",
        "Redundant phrase": "Redundant Phrasing",
        "Weak start with": "Weak Sentence Starter",
        "Unescaped &": "Unescaped Special Character",
        "Citation without non-breaking space": "Missing Non-breaking Space (~)",
        "Mixed citation styles": "Mixed Citation Styles",
        "Mixed inline math": "Mixed Math Notation",
        "Appendix section": "Unreferenced Appendix",
        "Missing space before unit": "Unit Spacing Issue",
    }

    latex_stats = {}
    for result in self.submission_results:
        if result.passed:
            continue
        raw_msg = result.message

        rule_name = next(
            (official for pattern, official in RULE_MAPPING.items() if pattern in raw_msg),
            None,
        )
        if rule_name is None:
            # Fallback: strip the dynamic parts (parenthesised text, quoted
            # text, digits) and keep whatever precedes the first colon.
            clean_msg = re.sub(r"\(.*?\)", "", raw_msg)
            clean_msg = re.sub(r"'.*?'", "", clean_msg)
            clean_msg = re.sub(r"\d+", "", clean_msg)
            # A message made only of dynamic parts would otherwise yield an
            # empty rule name.
            rule_name = clean_msg.split(":")[0].strip() or "Unknown Rule"

        latex_stats[rule_name] = latex_stats.get(rule_name, 0) + 1

    return bib_stats, latex_stats
def generate_console_output(self) -> str:
    """Render the console variant of the report: summary plus issues only.

    Each of the two sections is followed by one blank line.
    """
    console_lines = [*self._generate_summary(), ""]
    console_lines += [*self._generate_issues_section(), ""]
    return "\n".join(console_lines)
def _generate_header(self) -> list[str]:
"""Generate report header.
File names are intentionally not printed — keep the report
portable, and never expose local source paths to anyone the
report is shared with.
"""
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
return [
"# Bibliography Validation Report",
"",
f"**Generated:** {timestamp}",
"",
"| Inputs | Count |",
"|--------|-------|",
f"| **Bib File(s)** | {len(self.bib_files)} |",
f"| **TeX File(s)** | {len(self.tex_files)} |",
]
def _generate_disclaimer(self) -> list[str]:
"""Generate disclaimer section."""
return [
"> **⚠️ Disclaimer:** This report is generated by an automated tool. While BibGuard strives for accuracy, it may produce false positives or miss certain issues. **This tool cannot replace human review.** Please manually verify all reported issues before making changes to your bibliography."
]
def _generate_summary(self) -> list[str]:
"""Generate summary statistics."""
total = len(self.entries)
# Check availability of results
has_metadata = any(e.comparison is not None for e in self.entries)
has_usage = any(e.usage is not None for e in self.entries)
has_eval = any(len(e.evaluations) > 0 for e in self.entries)
# Calculate Verified/Issues
# Note: _is_verified depends on _has_issues.
# If a check wasn't run, it won't contribute to issues.
verified = sum(1 for e in self.entries if self._is_verified(e))
issues = sum(1 for e in self.entries if self._has_issues(e))
# Usage stats
if has_usage:
used = sum(1 for e in self.entries if e.usage and e.usage.is_used)
unused = total - used
used_str = str(used)
unused_str = str(unused)
missing_str = str(len(self.missing_citations))
else:
used_str = "N/A"
unused_str = "N/A"
missing_str = "N/A"
# Duplicate stats - show N/A if check wasn't run (duplicate_groups is None means not checked)
if self.duplicate_groups is None:
dup_str = "N/A"
else:
dup_str = str(len(self.duplicate_groups))
# Preprint detection (only if enabled)
preprint_str = "N/A"
preprint_warning = []
if self.check_preprint_ratio and has_usage:
used_entries = [e for e in self.entries if e.usage and e.usage.is_used]
if used_entries:
preprint_count = sum(1 for e in used_entries if self._is_preprint(e.entry))
preprint_ratio = preprint_count / len(used_entries)
preprint_str = f"{preprint_count} ({preprint_ratio:.1%})"
# Warning if exceeds threshold
if preprint_ratio > self.preprint_warning_threshold:
preprint_warning = [
"",
f"> ⚠️ **High Preprint Ratio Warning:** {preprint_ratio:.1%} of your used references are preprints (arXiv, bioRxiv, etc.). Consider replacing some with peer-reviewed publications if available."
]
summary_lines = [
"## 📊 Summary",
"",
"### 📚 Bibliography Statistics",
"",
"| Metric | Count |",
"|--------|-------|",
f"| **Total Entries** | {total} |",
f"| ✅ **Verified (Clean)** | {verified} |",
f"| ⚠️ **With Issues** | {issues} |",
f"| 📝 **Used in TeX** | {used_str} |",
f"| 🗑️ **Unused** | {unused_str} |",
f"| 🔄 **Duplicate Groups** | {dup_str} |",
f"| ❌ **Missing Bib Entries** | {missing_str} |",
f"| 📄 **Preprints (Used)** | {preprint_str} |",
]
# Add warning if needed
if preprint_warning:
summary_lines.extend(preprint_warning)
summary_lines.extend([
"",
"### 📋 LaTeX Quality Checks",
"",
self._get_submission_summary()
])
return summary_lines
def _is_preprint(self, entry: BibEntry) -> bool:
"""Check if an entry is a preprint."""
# Preprint indicators
preprint_keywords = [
'arxiv', 'biorxiv', 'medrxiv', 'ssrn', 'preprint',
'openreview', 'techreport', 'technical report', 'working paper',
'tech report', 'tech. report'
]
# Check entry type
if entry.entry_type.lower() in ['techreport', 'unpublished', 'misc']:
# Further check if it's actually a preprint
text_to_check = ' '.join([
entry.journal.lower(),
entry.booktitle.lower(),
entry.publisher.lower(),
entry.entry_type.lower()
])
if any(keyword in text_to_check for keyword in preprint_keywords):
return True
# Check if arXiv ID exists
if entry.has_arxiv:
return True
# Check journal/booktitle/publisher fields
venue_text = ' '.join([
entry.journal.lower(),
entry.booktitle.lower(),
entry.publisher.lower()
])
return any(keyword in venue_text for keyword in preprint_keywords)
def _get_submission_summary(self) -> str:
"""Generate submission quality summary table."""
if not self.submission_results:
return "*No quality checks were performed.*"
# Count by severity
error_count = sum(1 for r in self.submission_results if r.severity == CheckSeverity.ERROR)
warning_count = sum(1 for r in self.submission_results if r.severity == CheckSeverity.WARNING)
info_count = sum(1 for r in self.submission_results if r.severity == CheckSeverity.INFO)
lines = [
"| Severity | Count |",
"|----------|-------|",
f"| 🔴 **Errors** | {error_count} |",
f"| 🟡 **Warnings** | {warning_count} |",
f"| 🔵 **Suggestions** | {info_count} |"
]
return "\n".join(lines)
def _is_verified(self, entry: EntryReport) -> bool:
"""Check if entry is clean (no issues)."""
return not self._has_issues(entry)
def _has_issues(self, entry: EntryReport) -> bool:
"""Check if entry has any issues."""
# Metadata issues
if entry.comparison and entry.comparison.has_issues:
return True
# LLM issues (low relevance)
if any(ev.relevance_score <= 2 for ev in entry.evaluations):
return True
# NOTE: We don't include usage issues (unused) here because
# unused entries are already shown in the "Unused Entries" section
return False
def _has_metadata_or_relevance_issues(self, entry: EntryReport) -> bool:
"""Check if entry has metadata or relevance issues (excluding duplicate/unused)."""
# Metadata issues
if entry.comparison and entry.comparison.has_issues:
return True
# LLM issues (low relevance)
if any(ev.relevance_score <= 2 for ev in entry.evaluations):
return True
return False
def _generate_issues_section(self) -> list[str]:
"""Generate detailed section for entries with issues."""
lines = ["## ⚠️ Critical Issues Detected", ""]
has_any_issues = False
# 1. Missing Citations
if self.missing_citations:
has_any_issues = True
lines.append("### ❌ Missing Bibliography Entries")
lines.append("The following keys are cited in the TeX file but missing from the .bib file:")
lines.append("")
for key in self.missing_citations:
lines.append(f"- `{key}`")
lines.append("")
# 2. Duplicate Entries
if self.duplicate_groups:
has_any_issues = True
lines.append("### 🔄 Duplicate Entries")
for i, group in enumerate(self.duplicate_groups, 1):
lines.append(f"#### Group {i} (Similarity: {group.similarity_score:.0%})")
lines.append(f"**Reason:** {group.reason}")
lines.append("")
lines.append("| Key | Title | Year |")
lines.append("|-----|-------|------|")
for entry in group.entries:
lines.append(f"| `{entry.key}` | {entry.title} | {entry.year} |")
lines.append("")
# 3. Unused Entries
unused = [e for e in self.entries if e.usage and not e.usage.is_used]
if unused:
has_any_issues = True
lines.append("### 🗑️ Unused Entries")
lines.append("The following entries are in the .bib file but NOT cited in the TeX file:")
lines.append("")
for e in unused:
lines.append(f"- `{e.entry.key}`: *{e.entry.title}*")
lines.append("")
# 4. Metadata Mismatches & Low Relevance
issue_entries = [e for e in self.entries if self._has_metadata_or_relevance_issues(e)]
if issue_entries:
has_any_issues = True
lines.append("### ⚠️ Metadata & Relevance Issues")
for entry_report in issue_entries:
lines.extend(self._format_entry_detail(entry_report, is_verified=False))
if not has_any_issues:
lines.append("🎉 **No critical issues found!**")
return lines
def _generate_verified_section(self) -> list[str]:
"""Generate section for verified entries."""
lines = ["## ✅ Verified Entries", ""]
verified = [e for e in self.entries if self._is_verified(e)]
if not verified:
lines.append("_No verified entries found._")
return lines
lines.append(f"Found **{len(verified)}** entries with correct metadata.")
lines.append("")
# Use a collapsible details block for clean UI
lines.append("")
lines.append("Click to view verified entries
")
lines.append("")
for entry_report in verified:
lines.extend(self._format_entry_detail(entry_report, minimal=self.minimal_verified, is_verified=True))
lines.append(" ")
return lines
def _format_entry_detail(self, report: EntryReport, minimal: bool = False, is_verified: bool = False) -> list[str]:
"""Format a single entry report in Markdown."""
entry = report.entry
comp = report.comparison
lines = []
# Title header - use checkmark for verified entries, warning for issues
icon = "✅" if is_verified else "⚠️"
lines.append(f"#### {icon} `{entry.key}`")
lines.append(f"**Title:** {entry.title}")
lines.append("")
# Metadata Status
if comp:
status_icon = "✅" if comp.is_match else "❌"
lines.append(f"- **Metadata Status:** {status_icon} {comp.source.upper()} (Confidence: {comp.confidence:.1%})")
if comp.has_issues and not minimal:
lines.append(" - **Discrepancies:**")
for issue in comp.issues:
# Format mismatch details nicely
if "Mismatch" in issue or "mismatch" in issue:
lines.append(f" - 🔴 {issue}")
if "Title" in issue:
lines.append(f" - **Bib:** `{comp.bib_title}`")
lines.append(f" - **Fetched:** `{comp.fetched_title}`")
elif "Author" in issue:
lines.append(f" - **Bib:** `{', '.join(comp.bib_authors)}`")
lines.append(f" - **Fetched:** `{', '.join(comp.fetched_authors)}`")
else:
lines.append(f" - 🔸 {issue}")
# Positive notes (corroboration, year-tolerance) — separate from issues.
notes = list(getattr(comp, "notes", []) or [])
if notes and not minimal:
lines.append(" - **Notes:**")
for note in notes:
lines.append(f" - 🟢 {note}")
# Relevance Status
if report.evaluations and not minimal:
lines.append("- **Relevance Analysis:**")
for eval_res in report.evaluations:
score_icon = "🟢" if eval_res.relevance_score >= 4 else ("🟡" if eval_res.relevance_score == 3 else "🔴")
lines.append(f" - {score_icon} **Score {eval_res.relevance_score}/5** ({eval_res.score_label})")
if eval_res.line_number:
lines.append(f" - Line {eval_res.line_number}")
lines.append(f" - *\"{eval_res.explanation}\"*")
lines.append("")
lines.append("---")
lines.append("")
return lines
def _generate_submission_section(self) -> list[str]:
"""Generate section for submission quality check results."""
lines = ["## 📋 Submission Quality Checks", ""]
# Template info
if self.template:
lines.append(f"**Conference Template:** {self.template.name}")
lines.append(f"**Page Limit:** {self.template.page_limit_review} (review) / {self.template.page_limit_camera} (camera-ready)")
if self.template.mandatory_sections:
lines.append(f"**Required Sections:** {', '.join(self.template.mandatory_sections)}")
lines.append("")
# Count by severity
errors = [r for r in self.submission_results if r.severity == CheckSeverity.ERROR and not r.passed]
warnings = [r for r in self.submission_results if r.severity == CheckSeverity.WARNING and not r.passed]
infos = [r for r in self.submission_results if r.severity == CheckSeverity.INFO and not r.passed]
# Summary
if errors or warnings or infos:
lines.append("| Severity | Count |")
lines.append("|----------|-------|")
if errors:
lines.append(f"| 🔴 **Errors** | {len(errors)} |")
if warnings:
lines.append(f"| 🟡 **Warnings** | {len(warnings)} |")
if infos:
lines.append(f"| 🔵 **Suggestions** | {len(infos)} |")
lines.append("")
else:
lines.append("🎉 **No submission issues found!**")
lines.append("")
return lines
# Group by checker
by_checker = {}
for result in self.submission_results:
if result.passed:
continue
if result.checker_name not in by_checker:
by_checker[result.checker_name] = []
by_checker[result.checker_name].append(result)
def _format_one(result) -> list[str]:
"""Render a single CheckResult — line number only, no file path,
no truncation. The HTML report follows the same convention."""
buf = [f"- {result.message}"]
if result.line_number:
buf.append(f" - Line {result.line_number}")
if result.line_content:
# Highlight the offending span if the checker provided one.
content = result.line_content
if getattr(result, "match_text", None) and result.match_text in content:
idx = content.index(result.match_text)
content = (content[:idx]
+ "**" + result.match_text + "**"
+ content[idx + len(result.match_text):])
buf.append(f" - `{content}`")
if result.suggestion:
buf.append(f" - 💡 *{result.suggestion}*")
return buf
# Display errors first
if errors:
lines.append("### 🔴 Critical Errors")
lines.append("")
for result in errors:
lines.extend(_format_one(result))
lines.append("")
# Display warnings
if warnings:
lines.append("### 🟡 Warnings")
lines.append("")
for result in warnings:
lines.extend(_format_one(result))
lines.append("")
# Display suggestions (collapsible)
if infos:
lines.append("### 🔵 Suggestions")
lines.append("")
lines.append("Click to view suggestions
")
lines.append("")
for result in infos:
lines.extend(_format_one(result))
lines.append("")
lines.append(" ")
lines.append("")
return lines
def _generate_footer(self) -> list[str]:
"""Generate report footer."""
return [
"",
"---",
f"Report generated by **BibGuard** on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
]
def save(self, filepath: str):
    """Render the full Markdown report and write it to ``filepath`` as UTF-8.

    The report is rendered before the file is opened, so a failure while
    generating does not truncate an existing file.
    """
    rendered = self.generate()
    with open(filepath, mode='w', encoding='utf-8') as handle:
        handle.write(rendered)
def save_bibliography_report(self, filepath: str):
    """Write the bibliography-only Markdown report to ``filepath`` (UTF-8).

    Contains header, disclaimer, a bibliography summary table, the issues
    section, the verified section and the footer. No LaTeX quality content
    and no preprint row are included.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    out = [
        "# Bibliography Validation Report",
        "",
        f"**Generated:** {stamp}",
        "",
        "| Inputs | Count |",
        "|--------|-------|",
        f"| **Bib File(s)** | {len(self.bib_files)} |",
        f"| **TeX File(s)** | {len(self.tex_files)} |",
        "",
    ]

    out += self._generate_disclaimer()
    out.append("")

    # Summary figures (bibliography only)
    total = len(self.entries)
    verified = sum(1 for e in self.entries if self._is_verified(e))
    issues = sum(1 for e in self.entries if self._has_issues(e))

    if any(e.usage is not None for e in self.entries):
        used = sum(1 for e in self.entries if e.usage and e.usage.is_used)
        used_str, unused_str = str(used), str(total - used)
        missing_str = str(len(self.missing_citations))
    else:
        # No usage information at all
        used_str = unused_str = missing_str = "N/A"

    # None means the duplicate check was not run
    dup_str = "N/A" if self.duplicate_groups is None else str(len(self.duplicate_groups))

    out += [
        "## 📊 Summary",
        "",
        "| Metric | Count |",
        "|--------|-------|",
        f"| **Total Entries** | {total} |",
        f"| ✅ **Verified (Clean)** | {verified} |",
        f"| ⚠️ **With Issues** | {issues} |",
        f"| 📝 **Used in TeX** | {used_str} |",
        f"| 🗑️ **Unused** | {unused_str} |",
        f"| 🔄 **Duplicate Groups** | {dup_str} |",
        f"| ❌ **Missing Bib Entries** | {missing_str} |",
        "",
    ]

    out += self._generate_issues_section()
    out.append("")
    out += self._generate_verified_section()
    out.append("")
    out += self._generate_footer()

    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write("\n".join(out))
# ------------------------------------------------------------------
# JSON + standalone HTML output
# ------------------------------------------------------------------
def build_payload(self) -> Dict[str, Any]:
    """Assemble the dictionary that backs both the JSON and the HTML output.

    Top-level keys: ``meta``, ``summary``, ``entries``,
    ``submission_results``, ``retractions``, ``url_findings``,
    ``duplicates`` and ``missing_citations``. Source file names are never
    included - only their counts.
    """
    entry_fields = (
        "key", "entry_type", "title", "author", "year", "journal",
        "booktitle", "publisher", "doi", "arxiv_id", "url", "volume", "pages",
    )
    comparison_fields = (
        "is_match", "confidence", "title_match", "title_similarity",
        "author_match", "author_similarity", "year_match",
        "bib_title", "fetched_title", "bib_authors", "fetched_authors",
        "bib_year", "fetched_year",
    )

    def serialize_entry(bib):
        return {name: getattr(bib, name) for name in entry_fields}

    def serialize_comparison(comparison):
        if comparison is None:
            return None
        data = {name: getattr(comparison, name) for name in comparison_fields}
        data["issues"] = list(comparison.issues)
        data["source"] = comparison.source
        # Optional attributes: read defensively.
        data["notes"] = list(getattr(comparison, "notes", []) or [])
        data["published_version_hint"] = getattr(comparison, "published_version_hint", "")
        return data

    def serialize_usage(usage):
        if usage is None:
            return None
        return {"is_used": usage.is_used, "usage_count": getattr(usage, "usage_count", 0)}

    def serialize_evaluation(evaluation):
        # NOTE(review): file_path is emitted here although the meta and
        # submission sections deliberately omit paths - confirm that this
        # is intended.
        return {
            "entry_key": evaluation.entry_key,
            "relevance_score": evaluation.relevance_score,
            "is_relevant": evaluation.is_relevant,
            "explanation": evaluation.explanation,
            "citation_role": getattr(evaluation, "citation_role", ""),
            "line_number": evaluation.line_number,
            "file_path": evaluation.file_path,
            "error": evaluation.error,
        }

    entries_payload = [
        {
            "entry": serialize_entry(report.entry),
            "comparison": serialize_comparison(report.comparison),
            "usage": serialize_usage(report.usage),
            "evaluations": [serialize_evaluation(ev) for ev in (report.evaluations or [])],
        }
        for report in self.entries
    ]

    # file_path intentionally omitted - user-facing report should never
    # expose local tex paths.
    sub_payload = [
        {
            "checker": result.checker_name,
            "passed": result.passed,
            "severity": result.severity.value if hasattr(result.severity, "value") else str(result.severity),
            "message": result.message,
            "line_number": result.line_number,
            "line_content": result.line_content,
            "suggestion": result.suggestion,
            "match_text": getattr(result, "match_text", None),
        }
        for result in self.submission_results
    ]

    retr_payload = []
    for finding in self.retraction_findings:
        outcome = getattr(finding, "result", None)
        record = {
            "entry_key": getattr(finding, "entry_key", ""),
            "doi": getattr(finding, "doi", ""),
        }
        # A missing/falsy result yields the neutral defaults.
        for field, fallback in (
            ("is_retracted", False),
            ("update_type", ""),
            ("notice_doi", ""),
            ("notice_label", ""),
            ("notice_url", ""),
        ):
            record[field] = getattr(outcome, field, fallback) if outcome else fallback
        retr_payload.append(record)

    url_payload = [
        {
            "entry_key": getattr(finding, "entry_key", ""),
            "url": getattr(finding, "url", ""),
            "status": getattr(finding, "status", ""),
            "status_code": getattr(finding, "status_code", None),
            "detail": getattr(finding, "detail", ""),
        }
        for finding in self.url_findings
    ]

    # Each duplicate group is reduced to its non-empty entry keys.
    duplicates = []
    for group in (self.duplicate_groups or []):
        group_keys = (getattr(member, "key", "") for member in getattr(group, "entries", []))
        duplicates.append([key for key in group_keys if key])

    bib_stats, latex_stats = self.get_summary_stats()

    meta = {
        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        # Counts only - never expose source filenames in any
        # downstream artifact (HTML, JSON, anywhere else).
        "bib_files_count": len(self.bib_files),
        "tex_files_count": len(self.tex_files),
        "template": getattr(self.template, "name", "") if self.template else "",
    }
    return {
        "meta": meta,
        "summary": {"bibliography": bib_stats, "latex": latex_stats},
        "entries": entries_payload,
        "submission_results": sub_payload,
        "retractions": retr_payload,
        "url_findings": url_payload,
        "duplicates": duplicates,
        "missing_citations": list(self.missing_citations),
    }
def save_json(self, filepath: str) -> None:
    """Dump the report payload to ``filepath`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is; values the encoder cannot handle
    natively go through ``_json_default``.
    """
    report_data = self.build_payload()
    dump_options = {"ensure_ascii": False, "indent": 2, "default": _json_default}
    with open(filepath, "w", encoding="utf-8") as handle:
        json.dump(report_data, handle, **dump_options)
def save_html(self, filepath: str) -> None:
    """Render the payload with ``render_standalone_html`` and write it as UTF-8.

    The markup is rendered before the file is opened.
    """
    markup = render_standalone_html(self.build_payload())
    with open(filepath, "w", encoding="utf-8") as handle:
        handle.write(markup)
def save_latex_quality_report(self, filepath: str, submission_results: List[CheckResult], template=None):
    """Generate and save the LaTeX quality report (all tex-related checks).

    The summary table counts only *failed* results so that it agrees with
    the detail section rendered right below it, which also lists failed
    results only.

    Side effect: ``self.submission_results`` and ``self.template`` are
    replaced by the given arguments, because the detail section is rendered
    by ``_generate_submission_section``, which reads them from ``self``.

    Args:
        filepath: Destination file, written as UTF-8.
        submission_results: Check results to report.
        template: Optional conference template; its ``name`` is printed.
    """
    lines = []

    # Header
    lines.append("# LaTeX Quality Report")
    lines.append("")
    lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("")
    lines.append(f"**Inputs:** {len(self.tex_files)} TeX file(s)")
    lines.append("")
    if template:
        lines.append(f"**Template:** {template.name}")
        lines.append("")

    # Disclaimer
    lines.append("> **⚠️ Note:** This report contains automated quality checks for your LaTeX document. Please review all suggestions carefully before making changes.")
    lines.append("")

    # Summary - failed results only
    failed = [r for r in submission_results if not r.passed]
    error_count = sum(1 for r in failed if r.severity == CheckSeverity.ERROR)
    warning_count = sum(1 for r in failed if r.severity == CheckSeverity.WARNING)
    info_count = sum(1 for r in failed if r.severity == CheckSeverity.INFO)
    lines.append("## 📊 Summary")
    lines.append("")
    lines.append("| Severity | Count |")
    lines.append("|----------|-------|")
    lines.append(f"| 🔴 **Errors** | {error_count} |")
    lines.append(f"| 🟡 **Warnings** | {warning_count} |")
    lines.append(f"| 🔵 **Suggestions** | {info_count} |")
    lines.append("")

    # Detailed issues (rendered from instance state, see docstring)
    self.submission_results = submission_results
    self.template = template
    lines.extend(self._generate_submission_section())
    lines.append("")

    # Footer
    lines.append("---")
    lines.append("")
    lines.append(f"Report generated by **BibGuard** on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    content = "\n".join(lines)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)