"""
Pre-flight validation for user-supplied .bib / .tex inputs.
Catch obvious problems (giant files, files that don't actually contain bibs/cites)
*before* spending five minutes on metadata fetches.
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import List
logger = logging.getLogger(__name__)
@dataclass
class ValidationReport:
    """Outcome of a pre-flight check: an overall flag plus collected messages."""

    # True until an error is recorded; warnings never change it.
    ok: bool = True
    # Human-readable error messages, in the order they were added.
    errors: List[str] = field(default_factory=list)
    # Human-readable warning messages, in the order they were added.
    warnings: List[str] = field(default_factory=list)

    def __post_init__(self) -> None:
        # A report constructed with errors already attached must not claim to
        # be ok; keep the flag consistent with the list from the start.
        if self.errors:
            self.ok = False

    def add_error(self, msg: str) -> None:
        """Append ``msg`` to ``errors`` and mark the report as failed."""
        self.errors.append(msg)
        self.ok = False

    def add_warning(self, msg: str) -> None:
        """Append ``msg`` to ``warnings``; ``ok`` is left untouched."""
        self.warnings.append(msg)
# Sensible thresholds; tuned for typical CS/ML papers.
MAX_BIB_BYTES = 5 * 1024 * 1024  # 5 MB
MAX_BIB_ENTRIES = 5000
MAX_TEX_BYTES = 10 * 1024 * 1024  # 10 MB

# Start of a BibTeX entry: optional indentation, "@type", then "{" or "(".
# The @comment / @string / @preamble directives are skipped because they do
# not define a citable entry; the match is case-insensitive so "@STRING" and
# "@Comment" are skipped as well.
_BIB_ENTRY = re.compile(
    r"^[ \t]*@(?!(?:comment|string|preamble)\b)\w+\s*[{(]",
    re.MULTILINE | re.IGNORECASE,
)

# Any of the listed citation commands. The trailing \b keeps a shorter name
# from matching as a prefix of a longer, unlisted command.
_TEX_HAS_CITES = re.compile(
    r"\\(?:cite|citep|citet|citeauthor|citeyear|nocite|parencite|textcite)\b"
)

# Any command that declares or prints a bibliography. The trailing \b means
# \bibliographystyle does not count as \bibliography.
_TEX_HAS_BIB = re.compile(
    r"\\(?:bibliography|addbibresource|printbibliography|bibitem)\b"
)
def validate_bib(path: Path) -> ValidationReport:
    """Pre-flight check on a .bib file.

    Args:
        path: Location of the BibTeX file to inspect.

    Returns:
        A ValidationReport. An error is recorded when the path does not
        exist, is not a file, is empty, cannot be inspected or read, or
        contains nothing matching ``_BIB_ENTRY``. A warning is recorded when
        the file is larger than ``MAX_BIB_BYTES`` or has more than
        ``MAX_BIB_ENTRIES`` matches.
    """
    rep = ValidationReport()

    # The filesystem probes can themselves raise (e.g. PermissionError), so
    # they share one handler and the failure lands in the report instead of
    # escaping to the caller.
    try:
        if not path.exists():
            rep.add_error(f"Bib file does not exist: {path}")
            return rep
        if not path.is_file():
            rep.add_error(f"Not a file: {path}")
            return rep
        size = path.stat().st_size
    except OSError as e:
        rep.add_error(f"Cannot read bib file: {e}")
        return rep

    if size == 0:
        rep.add_error(f"Bib file is empty: {path}")
        return rep
    if size > MAX_BIB_BYTES:
        # Oversized files are still scanned; this is only a heads-up.
        rep.add_warning(
            f".bib file is large ({size/1024/1024:.1f} MB). Metadata checks may be slow."
        )

    try:
        # utf-8-sig drops a leading byte-order mark, which would otherwise
        # sit in front of the first "@" and stop the line-anchored entry
        # pattern from matching an entry on the very first line.
        text = path.read_text(encoding="utf-8-sig", errors="replace")
    except OSError as e:
        rep.add_error(f"Cannot read bib file: {e}")
        return rep

    n = len(_BIB_ENTRY.findall(text))
    if n == 0:
        rep.add_error(f"No bibtex entries (`@type{{...}}`) found in {path.name}.")
    elif n > MAX_BIB_ENTRIES:
        rep.add_warning(
            f"{n} entries in {path.name}; metadata checks may take a long time."
        )
    return rep
def validate_tex(path: Path) -> ValidationReport:
    """Pre-flight check on a .tex file.

    Args:
        path: Location of the TeX source file to inspect.

    Returns:
        A ValidationReport. An error is recorded when the path does not
        exist, is not a file, is empty, or cannot be inspected or read. A
        warning is recorded when the file is larger than ``MAX_TEX_BYTES``,
        and when the text matches neither ``_TEX_HAS_CITES`` nor
        ``_TEX_HAS_BIB``.
    """
    rep = ValidationReport()

    # The filesystem probes can themselves raise (e.g. PermissionError), so
    # they share one handler and the failure lands in the report instead of
    # escaping to the caller.
    try:
        if not path.exists():
            rep.add_error(f"TeX file does not exist: {path}")
            return rep
        if not path.is_file():
            rep.add_error(f"Not a file: {path}")
            return rep
        size = path.stat().st_size
    except OSError as e:
        rep.add_error(f"Cannot read tex file: {e}")
        return rep

    if size == 0:
        rep.add_error(f"TeX file is empty: {path}")
        return rep
    if size > MAX_TEX_BYTES:
        # Oversized files are still scanned; this is only a heads-up.
        rep.add_warning(
            f".tex file is large ({size/1024/1024:.1f} MB). Some checks scan whole content."
        )

    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        rep.add_error(f"Cannot read tex file: {e}")
        return rep

    has_cite = bool(_TEX_HAS_CITES.search(text))
    has_bib_decl = bool(_TEX_HAS_BIB.search(text))
    # Only a warning: the file is readable, it just shows no citation
    # commands and no bibliography declaration.
    if not (has_cite or has_bib_decl):
        rep.add_warning(
            f"{path.name} contains no \\cite{{...}} and no \\bibliography{{...}} — "
            "BibGuard's bibliography checks won't find anything to verify."
        )
    return rep
def format_report(rep: ValidationReport, label: str = "") -> str:
    """Render a ValidationReport as newline-separated text for stdout.

    Args:
        rep: Report whose ``errors`` and ``warnings`` are rendered.
        label: Optional tag; when non-empty each line starts with
            ``[label] ``.

    Returns:
        One line per message, all errors first and then all warnings, joined
        with newlines. An empty string when the report holds no messages.
    """
    tag = "" if not label else f"[{label}] "
    error_lines = [f"{tag}ERROR: {message}" for message in rep.errors]
    warning_lines = [f"{tag}WARN: {message}" for message in rep.warnings]
    return "\n".join(error_lines + warning_lines)
|