Spaces:
Sleeping
Sleeping
| """ | |
| Report generator for bibliography check results. | |
| """ | |
| import json | |
| import re | |
| from dataclasses import asdict, dataclass, is_dataclass | |
| from datetime import datetime | |
| from typing import Any, Optional, List, Dict | |
| from pathlib import Path | |
| from ..parsers.bib_parser import BibEntry | |
| from ..analyzers.metadata_comparator import ComparisonResult | |
| from ..analyzers.usage_checker import UsageResult | |
| from ..analyzers.llm_evaluator import EvaluationResult | |
| from ..analyzers.duplicate_detector import DuplicateGroup | |
| from ..checkers.base import CheckResult, CheckSeverity | |
| from .html_report import render_standalone_html | |
@dataclass
class EntryReport:
    """Complete report for a single bib entry.

    Declared as a dataclass so the annotated fields below become ``__init__``
    parameters; without the decorator the class would have no constructor
    accepting them.
    """

    # The parsed bibliography entry this report is about.
    entry: BibEntry
    # Metadata comparison result, or None when that check was not run.
    comparison: Optional[ComparisonResult]
    # Usage result, or None when that check was not run.
    usage: Optional[UsageResult]
    # LLM relevance evaluations for this entry (may be empty).
    evaluations: list[EvaluationResult]
| def _json_default(o): | |
| if is_dataclass(o): | |
| return asdict(o) | |
| if hasattr(o, "value"): | |
| return o.value | |
| return str(o) | |
class ReportGenerator:
    """Collects check results and renders them as reports.

    Results are pushed in through ``add_entry_report`` and the ``set_*``
    methods, then rendered on demand as Markdown (``generate``, ``save``,
    ``save_bibliography_report``, ``save_latex_quality_report``), JSON
    (``save_json``) or standalone HTML (``save_html``).

    Source file names are never written to any output; only their counts are.
    """

    # An evaluation with a relevance score at or below this value flags the entry.
    _LOW_RELEVANCE_MAX = 2

    # Substrings of the venue fields that mark an entry as a preprint.
    _PREPRINT_KEYWORDS = (
        'arxiv', 'biorxiv', 'medrxiv', 'ssrn', 'preprint',
        'openreview', 'techreport', 'technical report', 'working paper',
        'tech report', 'tech. report',
    )

    # Substring of a checker message -> display name used in the LaTeX stats.
    # Order matters: the first pattern found in the message wins.
    _RULE_MAPPING = {
        "Very long sentence": "Sentence Length (Critical)",
        "Long sentence": "Sentence Length (Warning)",
        "Possible Markdown bullet point": "Markdown Bullet Point",
        "Possible Markdown numbered list": "Markdown Numbered List",
        "Possible Markdown italic": "Markdown Italic",
        "Possible Markdown bold": "Markdown Bold",
        "Inconsistent hyphenation": "Hyphenation Inconsistency",
        "Inconsistent spelling": "Spelling Inconsistency",
        "Unreferenced figure": "Unreferenced Figure",
        "Unreferenced table": "Unreferenced Table",
        "Unreferenced section": "Unreferenced Section",
        "Unreferenced label": "Unreferenced Label",
        "Citation from": "Old Citation (10+ years)",
        "Hedging language": "Hedging/Vague Language",
        "Redundant phrase": "Redundant Phrasing",
        "Weak start with": "Weak Sentence Starter",
        "Unescaped &": "Unescaped Special Character",
        "Citation without non-breaking space": "Missing Non-breaking Space (~)",
        "Mixed citation styles": "Mixed Citation Styles",
        "Mixed inline math": "Mixed Math Notation",
        "Appendix section": "Unreferenced Appendix",
        "Missing space before unit": "Unit Spacing Issue",
    }

    def __init__(self, minimal_verified: bool = False, check_preprint_ratio: bool = True,
                 preprint_warning_threshold: float = 0.50):
        """Create an empty generator.

        Args:
            minimal_verified: Show only minimal info for verified entries.
            check_preprint_ratio: Whether to compute the preprint ratio.
            preprint_warning_threshold: Ratio of preprints among used entries
                above which a warning is added to the summary.
        """
        self.entries: list[EntryReport] = []
        self.missing_citations: list[str] = []
        self.duplicate_groups: list[DuplicateGroup] | None = None  # None means check not run
        self.bib_files: list[str] = []
        self.tex_files: list[str] = []
        self.bib_file: str = ""  # Keep for backward compatibility/single file
        self.tex_file: str = ""  # Keep for backward compatibility/single file
        self.minimal_verified = minimal_verified
        self.submission_results: List[CheckResult] = []  # Submission quality check results
        self.template = None  # Conference template if used
        self.check_preprint_ratio = check_preprint_ratio
        self.preprint_warning_threshold = preprint_warning_threshold
        self.retraction_findings: list = []  # F1 results
        self.url_findings: list = []  # F2 results

    # ------------------------------------------------------------------
    # Input setters
    # ------------------------------------------------------------------
    def set_retraction_findings(self, findings) -> None:
        """Store a copy of the retraction findings (``None`` becomes empty)."""
        self.retraction_findings = list(findings or [])

    def set_url_findings(self, findings) -> None:
        """Store a copy of the URL findings (``None`` becomes empty)."""
        self.url_findings = list(findings or [])

    def add_entry_report(self, report: EntryReport):
        """Add an entry report."""
        self.entries.append(report)

    def set_metadata(self, bib_files: str | list[str], tex_files: str | list[str]):
        """Set source file information.

        Each argument may be a single path or a sequence of paths. The
        single-file attributes ``bib_file`` / ``tex_file`` are set to the
        first path (or ``""`` when the sequence is empty).
        """
        # Copy the sequences so later mutation by the caller cannot change
        # the report, mirroring the set_*_findings setters.
        self.bib_files = [bib_files] if isinstance(bib_files, str) else list(bib_files)
        self.bib_file = self.bib_files[0] if self.bib_files else ""
        self.tex_files = [tex_files] if isinstance(tex_files, str) else list(tex_files)
        self.tex_file = self.tex_files[0] if self.tex_files else ""

    def set_missing_citations(self, missing: list[str]):
        """Set list of citations without bib entries."""
        self.missing_citations = missing

    def set_duplicate_groups(self, groups: list[DuplicateGroup]):
        """Set list of duplicate entry groups."""
        self.duplicate_groups = groups

    def set_submission_results(self, results: List[CheckResult], template=None):
        """Set submission quality check results and the template, if any."""
        self.submission_results = results
        self.template = template

    # ------------------------------------------------------------------
    # Markdown: top-level entry points
    # ------------------------------------------------------------------
    def generate(self) -> str:
        """Generate the full markdown report.

        Sections are emitted in order (header, disclaimer, summary, issues,
        verified entries, optional submission checks), each followed by a
        blank line, then the footer.
        """
        sections = [
            self._generate_header(),
            self._generate_disclaimer(),
            self._generate_summary(),
            self._generate_issues_section(),
            self._generate_verified_section(),
        ]
        # The submission section only appears when quality checks were run.
        if self.submission_results:
            sections.append(self._generate_submission_section())

        lines = []
        for section in sections:
            lines.extend(section)
            lines.append("")
        lines.extend(self._generate_footer())
        return "\n".join(lines)

    def get_summary_stats(self) -> tuple[dict, dict]:
        """Get summary statistics as dictionaries for console display (issues only).

        Returns:
            ``(bib_stats, latex_stats)``. ``bib_stats`` maps an issue label to
            its count and only contains labels with a count above zero.
            ``latex_stats`` maps a rule display name to the number of failed
            submission results attributed to it.
        """
        title_mismatches = author_mismatches = year_mismatches = 0
        low_relevance = unable_to_verify = 0

        for e in self.entries:
            if e.comparison and e.comparison.has_issues:
                # Mismatch categories are counted once per entry, however many
                # issue strings mention them.
                has_title = has_author = has_year = False
                for issue in e.comparison.issues:
                    if "Title mismatch" in issue:
                        has_title = True
                    elif "Author mismatch" in issue:
                        has_author = True
                    elif "Year mismatch" in issue:
                        has_year = True
                    elif "Unable to find" in issue:
                        unable_to_verify += 1
                if has_title:
                    title_mismatches += 1
                if has_author:
                    author_mismatches += 1
                if has_year:
                    year_mismatches += 1

            if self._has_low_relevance(e):
                low_relevance += 1

        candidates = {
            "Title Mismatches": title_mismatches,
            "Author Mismatches": author_mismatches,
            "Year Mismatches": year_mismatches,
            "Low Relevance": low_relevance,
            "Unable to Verify": unable_to_verify,
            "Duplicate Groups": len(self.duplicate_groups or []),
            "Missing Bib Entries": len(self.missing_citations or []),
            "Unused Entries": len(self._unused_entries()),
        }
        bib_stats = {label: count for label, count in candidates.items() if count > 0}

        # LaTeX stats - group failed results by rule display name.
        latex_stats: dict = {}
        for r in self.submission_results:
            if r.passed:
                continue
            rule_name = self._rule_name_for(r.message)
            latex_stats[rule_name] = latex_stats.get(rule_name, 0) + 1

        return bib_stats, latex_stats

    @classmethod
    def _rule_name_for(cls, message: str) -> str:
        """Map a checker message to a stable rule display name."""
        for pattern, official_name in cls._RULE_MAPPING.items():
            if pattern in message:
                return official_name
        # Fallback: strip the dynamic parts (parenthesised text, quoted text,
        # numbers) and keep what precedes the first colon.
        cleaned = re.sub(r"\(.*?\)", "", message)
        cleaned = re.sub(r"'.*?'", "", cleaned)
        cleaned = re.sub(r"\d+", "", cleaned)
        # If nothing is left after cleaning, use the generic label instead of
        # an empty rule name.
        return cleaned.split(":")[0].strip() or "Unknown Rule"

    def generate_console_output(self) -> str:
        """Generate console-friendly output (Summary + Issues only)."""
        lines = []
        lines.extend(self._generate_summary())
        lines.append("")
        lines.extend(self._generate_issues_section())
        lines.append("")
        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Markdown: sections
    # ------------------------------------------------------------------
    def _generate_header(self) -> list[str]:
        """Generate report header.

        File names are intentionally not printed — keep the report
        portable, and never expose local source paths to anyone the
        report is shared with.
        """
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        return [
            "# Bibliography Validation Report",
            "",
            f"**Generated:** {timestamp}",
            "",
            "| Inputs | Count |",
            "|--------|-------|",
            f"| **Bib File(s)** | {len(self.bib_files)} |",
            f"| **TeX File(s)** | {len(self.tex_files)} |",
        ]

    def _generate_disclaimer(self) -> list[str]:
        """Generate disclaimer section."""
        return [
            "> **⚠️ Disclaimer:** This report is generated by an automated tool. While BibGuard strives for accuracy, it may produce false positives or miss certain issues. **This tool cannot replace human review.** Please manually verify all reported issues before making changes to your bibliography."
        ]

    def _unused_entries(self) -> list:
        """Return entries that have usage data and are not cited."""
        return [e for e in self.entries if e.usage and not e.usage.is_used]

    def _bibliography_metric_rows(self) -> list[str]:
        """Build the table rows shared by every bibliography summary.

        Usage-related cells show ``N/A`` when no entry carries usage data, and
        the duplicate cell shows ``N/A`` when ``duplicate_groups`` is ``None``
        (duplicate check not run).
        """
        total = len(self.entries)
        verified = sum(1 for e in self.entries if self._is_verified(e))
        issues = sum(1 for e in self.entries if self._has_issues(e))

        if any(e.usage is not None for e in self.entries):
            used_str = str(sum(1 for e in self.entries if e.usage and e.usage.is_used))
            # Count only entries known to be unused, so this number matches
            # the "Unused Entries" list in the issues section.
            unused_str = str(len(self._unused_entries()))
            missing_str = str(len(self.missing_citations or []))
        else:
            used_str = unused_str = missing_str = "N/A"

        dup_str = "N/A" if self.duplicate_groups is None else str(len(self.duplicate_groups))

        return [
            f"| **Total Entries** | {total} |",
            f"| ✅ **Verified (Clean)** | {verified} |",
            f"| ⚠️ **With Issues** | {issues} |",
            f"| 📝 **Used in TeX** | {used_str} |",
            f"| 🗑️ **Unused** | {unused_str} |",
            f"| 🔄 **Duplicate Groups** | {dup_str} |",
            f"| ❌ **Missing Bib Entries** | {missing_str} |",
        ]

    def _generate_summary(self) -> list[str]:
        """Generate summary statistics (bibliography table + LaTeX table)."""
        # Preprint detection (only if enabled and at least one entry is used).
        preprint_str = "N/A"
        preprint_warning = []
        if self.check_preprint_ratio:
            used_entries = [e for e in self.entries if e.usage and e.usage.is_used]
            if used_entries:
                preprint_count = sum(1 for e in used_entries if self._is_preprint(e.entry))
                preprint_ratio = preprint_count / len(used_entries)
                preprint_str = f"{preprint_count} ({preprint_ratio:.1%})"
                if preprint_ratio > self.preprint_warning_threshold:
                    preprint_warning = [
                        "",
                        f"> ⚠️ **High Preprint Ratio Warning:** {preprint_ratio:.1%} of your used references are preprints (arXiv, bioRxiv, etc.). Consider replacing some with peer-reviewed publications if available."
                    ]

        summary_lines = [
            "## 📊 Summary",
            "",
            "### 📚 Bibliography Statistics",
            "",
            "| Metric | Count |",
            "|--------|-------|",
            *self._bibliography_metric_rows(),
            f"| 📄 **Preprints (Used)** | {preprint_str} |",
        ]
        summary_lines.extend(preprint_warning)
        summary_lines.extend([
            "",
            "### 📋 LaTeX Quality Checks",
            "",
            self._get_submission_summary(),
        ])
        return summary_lines

    def _is_preprint(self, entry: BibEntry) -> bool:
        """Check if an entry is a preprint.

        An entry counts as a preprint when its type is ``techreport``, when
        ``entry.has_arxiv`` is truthy, or when its journal / booktitle /
        publisher text contains one of the preprint keywords.
        """
        # 'techreport' is itself one of the keywords, so the type alone decides.
        if (entry.entry_type or "").lower() == 'techreport':
            return True
        if entry.has_arxiv:
            return True
        venue_text = ' '.join(
            (text or "").lower()
            for text in (entry.journal, entry.booktitle, entry.publisher)
        )
        return any(keyword in venue_text for keyword in self._PREPRINT_KEYWORDS)

    @staticmethod
    def _failed_with_severity(results, severity) -> list:
        """Return the results of the given severity that did not pass."""
        return [r for r in results if r.severity == severity and not r.passed]

    def _severity_table(self, results) -> list[str]:
        """Build the severity/count table for the failed results.

        Passed results are excluded so the counts agree with the detailed
        listing produced by ``_generate_submission_section``.
        """
        error_count = len(self._failed_with_severity(results, CheckSeverity.ERROR))
        warning_count = len(self._failed_with_severity(results, CheckSeverity.WARNING))
        info_count = len(self._failed_with_severity(results, CheckSeverity.INFO))
        return [
            "| Severity | Count |",
            "|----------|-------|",
            f"| 🔴 **Errors** | {error_count} |",
            f"| 🟡 **Warnings** | {warning_count} |",
            f"| 🔵 **Suggestions** | {info_count} |",
        ]

    def _get_submission_summary(self) -> str:
        """Generate submission quality summary table as a single string."""
        if not self.submission_results:
            return "*No quality checks were performed.*"
        return "\n".join(self._severity_table(self.submission_results))

    def _has_low_relevance(self, entry: EntryReport) -> bool:
        """Check whether any evaluation scored at or below the low-relevance cutoff."""
        return any(
            ev.relevance_score <= self._LOW_RELEVANCE_MAX
            for ev in (entry.evaluations or [])
        )

    def _is_verified(self, entry: EntryReport) -> bool:
        """Check if entry is clean (no issues)."""
        return not self._has_issues(entry)

    def _has_issues(self, entry: EntryReport) -> bool:
        """Check if entry has any issues.

        Unused entries are deliberately not treated as issues here: they are
        already listed in the "Unused Entries" section.
        """
        return self._has_metadata_or_relevance_issues(entry)

    def _has_metadata_or_relevance_issues(self, entry: EntryReport) -> bool:
        """Check if entry has metadata or relevance issues (excluding duplicate/unused)."""
        if entry.comparison and entry.comparison.has_issues:
            return True
        return self._has_low_relevance(entry)

    def _generate_issues_section(self) -> list[str]:
        """Generate detailed section for entries with issues."""
        lines = ["## ⚠️ Critical Issues Detected", ""]
        has_any_issues = False

        # 1. Missing Citations
        if self.missing_citations:
            has_any_issues = True
            lines.append("### ❌ Missing Bibliography Entries")
            lines.append("The following keys are cited in the TeX file but missing from the .bib file:")
            lines.append("")
            lines.extend(f"- `{key}`" for key in self.missing_citations)
            lines.append("")

        # 2. Duplicate Entries
        if self.duplicate_groups:
            has_any_issues = True
            lines.append("### 🔄 Duplicate Entries")
            for i, group in enumerate(self.duplicate_groups, 1):
                lines.append(f"#### Group {i} (Similarity: {group.similarity_score:.0%})")
                lines.append(f"**Reason:** {group.reason}")
                lines.append("")
                lines.append("| Key | Title | Year |")
                lines.append("|-----|-------|------|")
                for entry in group.entries:
                    lines.append(f"| `{entry.key}` | {entry.title} | {entry.year} |")
                lines.append("")

        # 3. Unused Entries
        unused = self._unused_entries()
        if unused:
            has_any_issues = True
            lines.append("### 🗑️ Unused Entries")
            lines.append("The following entries are in the .bib file but NOT cited in the TeX file:")
            lines.append("")
            lines.extend(f"- `{e.entry.key}`: *{e.entry.title}*" for e in unused)
            lines.append("")

        # 4. Metadata Mismatches & Low Relevance
        issue_entries = [e for e in self.entries if self._has_metadata_or_relevance_issues(e)]
        if issue_entries:
            has_any_issues = True
            lines.append("### ⚠️ Metadata & Relevance Issues")
            for entry_report in issue_entries:
                lines.extend(self._format_entry_detail(entry_report, is_verified=False))

        if not has_any_issues:
            lines.append("🎉 **No critical issues found!**")
        return lines

    def _generate_verified_section(self) -> list[str]:
        """Generate section for verified entries (inside a collapsible block)."""
        lines = ["## ✅ Verified Entries", ""]
        verified = [e for e in self.entries if self._is_verified(e)]
        if not verified:
            lines.append("_No verified entries found._")
            return lines

        lines.append(f"Found **{len(verified)}** entries with correct metadata.")
        lines.append("")
        # Use a collapsible details block for clean UI
        lines.append("<details>")
        lines.append("<summary>Click to view verified entries</summary>")
        lines.append("")
        for entry_report in verified:
            lines.extend(self._format_entry_detail(
                entry_report, minimal=self.minimal_verified, is_verified=True))
        lines.append("</details>")
        return lines

    def _format_entry_detail(self, report: EntryReport, minimal: bool = False,
                             is_verified: bool = False) -> list[str]:
        """Format a single entry report in Markdown.

        Args:
            report: The entry report to render.
            minimal: When true, discrepancies, notes and relevance analysis
                are omitted.
            is_verified: Selects the heading icon (check mark vs. warning).

        Nested bullets are indented two spaces per level so Markdown renders
        them as children of the preceding item rather than as siblings.
        """
        entry = report.entry
        comp = report.comparison
        lines = []

        icon = "✅" if is_verified else "⚠️"
        lines.append(f"#### {icon} `{entry.key}`")
        lines.append(f"**Title:** {entry.title}")
        lines.append("")

        # Metadata Status
        if comp:
            status_icon = "✅" if comp.is_match else "❌"
            lines.append(f"- **Metadata Status:** {status_icon} {comp.source.upper()} (Confidence: {comp.confidence:.1%})")
            if comp.has_issues and not minimal:
                lines.append("  - **Discrepancies:**")
                for issue in comp.issues:
                    if "mismatch" in issue.lower():
                        lines.append(f"    - 🔴 {issue}")
                        # Show both sides of the comparison for title/author.
                        if "Title" in issue:
                            lines.append(f"      - **Bib:** `{comp.bib_title}`")
                            lines.append(f"      - **Fetched:** `{comp.fetched_title}`")
                        elif "Author" in issue:
                            lines.append(f"      - **Bib:** `{', '.join(comp.bib_authors)}`")
                            lines.append(f"      - **Fetched:** `{', '.join(comp.fetched_authors)}`")
                    else:
                        lines.append(f"    - 🔸 {issue}")
            # Positive notes (corroboration, year-tolerance) — separate from issues.
            notes = list(getattr(comp, "notes", []) or [])
            if notes and not minimal:
                lines.append("  - **Notes:**")
                lines.extend(f"    - 🟢 {note}" for note in notes)

        # Relevance Status
        if report.evaluations and not minimal:
            lines.append("- **Relevance Analysis:**")
            for eval_res in report.evaluations:
                if eval_res.relevance_score >= 4:
                    score_icon = "🟢"
                elif eval_res.relevance_score == 3:
                    score_icon = "🟡"
                else:
                    score_icon = "🔴"
                lines.append(f"  - {score_icon} **Score {eval_res.relevance_score}/5** ({eval_res.score_label})")
                if eval_res.line_number:
                    lines.append(f"    - Line {eval_res.line_number}")
                lines.append(f"    - *\"{eval_res.explanation}\"*")

        lines.append("")
        lines.append("---")
        lines.append("")
        return lines

    @staticmethod
    def _format_check_result(result) -> list[str]:
        """Render a single CheckResult — line number only, no file path,
        no truncation. The HTML report follows the same convention."""
        buf = [f"- {result.message}"]
        if result.line_number:
            buf.append(f"  - Line {result.line_number}")
        if result.line_content:
            # Highlight the offending span if the checker provided one.
            # NOTE(review): the ** markers sit inside a code span, where
            # Markdown shows them literally rather than as bold — confirm
            # this is the intended presentation.
            content = result.line_content
            match_text = getattr(result, "match_text", None)
            if match_text and match_text in content:
                before, _, after = content.partition(match_text)
                content = before + "**" + match_text + "**" + after
            buf.append(f"  - `{content}`")
        if result.suggestion:
            buf.append(f"  - 💡 *{result.suggestion}*")
        return buf

    def _generate_submission_section(self) -> list[str]:
        """Generate section for submission quality check results.

        Only results that did not pass are counted and listed.
        """
        lines = ["## 📋 Submission Quality Checks", ""]

        # Template info
        if self.template:
            lines.append(f"**Conference Template:** {self.template.name}")
            lines.append(f"**Page Limit:** {self.template.page_limit_review} (review) / {self.template.page_limit_camera} (camera-ready)")
            if self.template.mandatory_sections:
                lines.append(f"**Required Sections:** {', '.join(self.template.mandatory_sections)}")
            lines.append("")

        errors = self._failed_with_severity(self.submission_results, CheckSeverity.ERROR)
        warnings = self._failed_with_severity(self.submission_results, CheckSeverity.WARNING)
        infos = self._failed_with_severity(self.submission_results, CheckSeverity.INFO)

        if not (errors or warnings or infos):
            lines.append("🎉 **No submission issues found!**")
            lines.append("")
            return lines

        # Summary table: only severities that actually occurred get a row.
        lines.append("| Severity | Count |")
        lines.append("|----------|-------|")
        if errors:
            lines.append(f"| 🔴 **Errors** | {len(errors)} |")
        if warnings:
            lines.append(f"| 🟡 **Warnings** | {len(warnings)} |")
        if infos:
            lines.append(f"| 🔵 **Suggestions** | {len(infos)} |")
        lines.append("")

        # Display errors first
        if errors:
            lines.append("### 🔴 Critical Errors")
            lines.append("")
            for result in errors:
                lines.extend(self._format_check_result(result))
            lines.append("")

        # Display warnings
        if warnings:
            lines.append("### 🟡 Warnings")
            lines.append("")
            for result in warnings:
                lines.extend(self._format_check_result(result))
            lines.append("")

        # Display suggestions (collapsible)
        if infos:
            lines.append("### 🔵 Suggestions")
            lines.append("<details>")
            lines.append("<summary>Click to view suggestions</summary>")
            lines.append("")
            for result in infos:
                lines.extend(self._format_check_result(result))
            lines.append("")
            lines.append("</details>")
            lines.append("")

        return lines

    def _generate_footer(self) -> list[str]:
        """Generate report footer."""
        return [
            "",
            "---",
            f"Report generated by **BibGuard** on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        ]

    # ------------------------------------------------------------------
    # Markdown: file output
    # ------------------------------------------------------------------
    def save(self, filepath: str):
        """Save the full markdown report to ``filepath`` (UTF-8)."""
        content = self.generate()
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)

    def save_bibliography_report(self, filepath: str):
        """Generate and save bibliography-only report (all bib-related checks).

        Same header, disclaimer, issues, verified and footer sections as the
        full report; the summary omits the preprint row and the LaTeX table.
        """
        lines = self._generate_header()
        lines.append("")

        lines.extend(self._generate_disclaimer())
        lines.append("")

        # Summary - Bibliography only
        lines.extend([
            "## 📊 Summary",
            "",
            "| Metric | Count |",
            "|--------|-------|",
            *self._bibliography_metric_rows(),
            "",
        ])

        lines.extend(self._generate_issues_section())
        lines.append("")

        lines.extend(self._generate_verified_section())
        lines.append("")

        lines.extend(self._generate_footer())

        content = "\n".join(lines)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)

    # ------------------------------------------------------------------
    # JSON + standalone HTML output
    # ------------------------------------------------------------------
    def build_payload(self) -> Dict[str, Any]:
        """Build the JSON-serializable payload used by JSON & HTML outputs."""

        def _entry_dict(e: BibEntry) -> dict:
            return {
                "key": e.key, "entry_type": e.entry_type, "title": e.title,
                "author": e.author, "year": e.year, "journal": e.journal,
                "booktitle": e.booktitle, "publisher": e.publisher,
                "doi": e.doi, "arxiv_id": e.arxiv_id, "url": e.url,
                "volume": e.volume, "pages": e.pages,
            }

        def _comparison_dict(c: Optional[ComparisonResult]) -> Optional[dict]:
            if c is None:
                return None
            return {
                "is_match": c.is_match, "confidence": c.confidence,
                "title_match": c.title_match, "title_similarity": c.title_similarity,
                "author_match": c.author_match, "author_similarity": c.author_similarity,
                "year_match": c.year_match,
                "bib_title": c.bib_title, "fetched_title": c.fetched_title,
                "bib_authors": c.bib_authors, "fetched_authors": c.fetched_authors,
                "bib_year": c.bib_year, "fetched_year": c.fetched_year,
                "issues": list(c.issues), "source": c.source,
                "notes": list(getattr(c, "notes", []) or []),
                "published_version_hint": getattr(c, "published_version_hint", ""),
            }

        def _usage_dict(u: Optional[UsageResult]) -> Optional[dict]:
            if u is None:
                return None
            return {"is_used": u.is_used, "usage_count": getattr(u, "usage_count", 0)}

        def _eval_dict(ev: EvaluationResult) -> dict:
            return {
                "entry_key": ev.entry_key,
                "relevance_score": ev.relevance_score,
                "is_relevant": ev.is_relevant,
                "explanation": ev.explanation,
                "citation_role": getattr(ev, "citation_role", ""),
                "line_number": ev.line_number, "file_path": ev.file_path,
                "error": ev.error,
            }

        entries_payload = [
            {
                "entry": _entry_dict(r.entry),
                "comparison": _comparison_dict(r.comparison),
                "usage": _usage_dict(r.usage),
                "evaluations": [_eval_dict(ev) for ev in (r.evaluations or [])],
            }
            for r in self.entries
        ]

        sub_payload = [
            {
                "checker": r.checker_name, "passed": r.passed,
                "severity": r.severity.value if hasattr(r.severity, "value") else str(r.severity),
                "message": r.message, "line_number": r.line_number,
                "line_content": r.line_content, "suggestion": r.suggestion,
                # file_path intentionally omitted — user-facing report should
                # never expose local tex paths.
                "match_text": getattr(r, "match_text", None),
            }
            for r in self.submission_results
        ]

        retr_payload = []
        for f in self.retraction_findings:
            # A finding without a result is reported as "not retracted" with
            # empty notice fields.
            res = getattr(f, "result", None)
            retr_payload.append({
                "entry_key": getattr(f, "entry_key", ""),
                "doi": getattr(f, "doi", ""),
                "is_retracted": getattr(res, "is_retracted", False) if res else False,
                "update_type": getattr(res, "update_type", "") if res else "",
                "notice_doi": getattr(res, "notice_doi", "") if res else "",
                "notice_label": getattr(res, "notice_label", "") if res else "",
                "notice_url": getattr(res, "notice_url", "") if res else "",
            })

        url_payload = [
            {
                "entry_key": getattr(f, "entry_key", ""),
                "url": getattr(f, "url", ""),
                "status": getattr(f, "status", ""),
                "status_code": getattr(f, "status_code", None),
                "detail": getattr(f, "detail", ""),
            }
            for f in self.url_findings
        ]

        # Each duplicate group is reduced to the list of its non-empty keys.
        duplicates = []
        for grp in (self.duplicate_groups or []):
            keys = [getattr(e, "key", "") for e in getattr(grp, "entries", [])]
            duplicates.append([k for k in keys if k])

        bib_stats, latex_stats = self.get_summary_stats()

        return {
            "meta": {
                "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                # Counts only — never expose source filenames in any
                # downstream artifact (HTML, JSON, anywhere else).
                "bib_files_count": len(self.bib_files),
                "tex_files_count": len(self.tex_files),
                "template": getattr(self.template, "name", "") if self.template else "",
            },
            "summary": {"bibliography": bib_stats, "latex": latex_stats},
            "entries": entries_payload,
            "submission_results": sub_payload,
            "retractions": retr_payload,
            "url_findings": url_payload,
            "duplicates": duplicates,
            "missing_citations": list(self.missing_citations or []),
        }

    def save_json(self, filepath: str) -> None:
        """Write a machine-readable JSON dump of the full report."""
        payload = self.build_payload()
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2, default=_json_default)

    def save_html(self, filepath: str) -> None:
        """Write a single self-contained HTML report (CSS+JS inlined)."""
        payload = self.build_payload()
        html = render_standalone_html(payload)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(html)

    def save_latex_quality_report(self, filepath: str, submission_results: List[CheckResult], template=None):
        """Generate and save LaTeX quality report (all tex-related quality checks).

        Side effect: ``self.submission_results`` and ``self.template`` are
        overwritten with the arguments, because the detailed section is
        rendered from instance state.

        Args:
            filepath: Destination file, written as UTF-8.
            submission_results: Check results to report on.
            template: Optional conference template; its ``name`` is printed.
        """
        lines = []

        # Header
        lines.append("# LaTeX Quality Report")
        lines.append("")
        lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        lines.append("")
        lines.append(f"**Inputs:** {len(self.tex_files)} TeX file(s)")
        lines.append("")
        if template:
            lines.append(f"**Template:** {template.name}")
            lines.append("")

        # Disclaimer
        lines.append("> **⚠️ Note:** This report contains automated quality checks for your LaTeX document. Please review all suggestions carefully before making changes.")
        lines.append("")

        # Summary (failed checks only, matching the detailed section below)
        lines.append("## 📊 Summary")
        lines.append("")
        lines.extend(self._severity_table(submission_results))
        lines.append("")

        # Detailed issues
        self.submission_results = submission_results
        self.template = template
        lines.extend(self._generate_submission_section())
        lines.append("")

        # Footer
        lines.append("---")
        lines.append("")
        lines.append(f"Report generated by **BibGuard** on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        content = "\n".join(lines)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)