BibGuard / src /utils /validation.py
thinkwee
v2.0
fcffa22
"""
Pre-flight validation for user-supplied .bib / .tex inputs.
Catch obvious problems (giant files, files that don't actually contain bibs/cites)
*before* spending five minutes on metadata fetches.
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import List
logger = logging.getLogger(__name__)
@dataclass
class ValidationReport:
    """Result of one pre-flight check: a pass/fail flag plus its messages.

    A fresh report starts out passing with no messages. Recording an error
    flips ``ok`` to False; recording a warning leaves ``ok`` alone.
    """

    # True until the first error is recorded through add_error().
    ok: bool = True
    # Error messages, in the order they were recorded.
    errors: List[str] = field(default_factory=list)
    # Warning messages, in the order they were recorded.
    warnings: List[str] = field(default_factory=list)

    def add_error(self, msg: str) -> None:
        """Mark the report as failed and store *msg* among the errors."""
        self.ok = False
        self.errors.append(msg)

    def add_warning(self, msg: str) -> None:
        """Store *msg* among the warnings without changing ``ok``."""
        self.warnings.append(msg)
# Size thresholds used by the validators in this module; tuned for typical
# CS/ML papers.
MAX_BIB_BYTES = 5 * 1024 * 1024  # 5 MB
MAX_BIB_ENTRIES = 5000
MAX_TEX_BYTES = 10 * 1024 * 1024  # 10 MB

# Start of a bibliography entry, e.g. `@article{` or `@book(`.
#   * Leading spaces/tabs are accepted because many tools indent entries.
#   * BibTeX allows either `{` or `(` as the entry delimiter.
#   * `@comment`, `@string` and `@preamble` are directives rather than
#     references, so the negative lookahead keeps them out of the count.
_BIB_ENTRY = re.compile(
    r"^[ \t]*@(?!(?:comment|string|preamble)\b)\w+\s*[{(]",
    re.MULTILINE | re.IGNORECASE,
)

# Any citation-style command: a control word containing "cite"/"Cite".
# Matching the whole family (instead of a fixed list) also recognises
# commands such as \citealp, \citealt, \autocite, \footcite and \Citet,
# which a `\cite\b`-style alternative cannot match.
_TEX_HAS_CITES = re.compile(r"\\[A-Za-z]*[Cc]ite[A-Za-z]*")

# Commands that declare or print a bibliography. The trailing `\b` keeps
# `\bibliographystyle` from being mistaken for `\bibliography`.
_TEX_HAS_BIB = re.compile(r"\\(?:bibliography|addbibresource|printbibliography|bibitem)\b")
def validate_bib(path: Path) -> ValidationReport:
    """Pre-flight check on a .bib file.

    Args:
        path: Location of the .bib file to inspect.

    Returns:
        A ValidationReport. A missing path, a path that is not a regular
        file, an empty file, an unreadable file, or a file without any
        entries is recorded as an error (``ok`` becomes False). A file
        larger than MAX_BIB_BYTES or with more than MAX_BIB_ENTRIES entries
        only adds a warning.
    """
    rep = ValidationReport()

    # Every filesystem probe can raise OSError (for example permission
    # denied on a parent directory, or the file disappearing between two
    # calls). They share one handler so such failures end up in the report
    # instead of propagating out of the pre-flight check.
    try:
        if not path.exists():
            rep.add_error(f"Bib file does not exist: {path}")
            return rep
        if not path.is_file():
            rep.add_error(f"Not a file: {path}")
            return rep
        size = path.stat().st_size
    except OSError as e:
        rep.add_error(f"Cannot read bib file: {e}")
        return rep

    if size == 0:
        rep.add_error(f"Bib file is empty: {path}")
        return rep
    if size > MAX_BIB_BYTES:
        # Large files are still processed; the user is only told to expect
        # a slow run.
        rep.add_warning(
            f".bib file is large ({size/1024/1024:.1f} MB). Metadata checks may be slow."
        )

    try:
        # "utf-8-sig" strips a leading byte-order mark. Left in place, the
        # BOM would sit in front of the first `@` and hide that entry from
        # the line-anchored entry pattern. Undecodable bytes are replaced
        # rather than treated as fatal.
        text = path.read_text(encoding="utf-8-sig", errors="replace")
    except OSError as e:
        rep.add_error(f"Cannot read bib file: {e}")
        return rep

    n = len(_BIB_ENTRY.findall(text))
    if n == 0:
        rep.add_error(f"No bibtex entries (`@type{{...}}`) found in {path.name}.")
    elif n > MAX_BIB_ENTRIES:
        rep.add_warning(
            f"{n} entries in {path.name}; metadata checks may take a long time."
        )
    return rep
def validate_tex(path: Path) -> ValidationReport:
    """Pre-flight check on a .tex file.

    Args:
        path: Location of the .tex file to inspect.

    Returns:
        A ValidationReport. A missing path, a path that is not a regular
        file, an empty file, or an unreadable file is recorded as an error
        (``ok`` becomes False). A file larger than MAX_TEX_BYTES, or one
        containing neither a citation command nor a bibliography
        declaration, only adds a warning.
    """
    rep = ValidationReport()

    # Every filesystem probe can raise OSError (for example permission
    # denied on a parent directory, or the file disappearing between two
    # calls). They share one handler so such failures end up in the report
    # instead of propagating out of the pre-flight check.
    try:
        if not path.exists():
            rep.add_error(f"TeX file does not exist: {path}")
            return rep
        if not path.is_file():
            rep.add_error(f"Not a file: {path}")
            return rep
        size = path.stat().st_size
    except OSError as e:
        rep.add_error(f"Cannot read tex file: {e}")
        return rep

    if size == 0:
        rep.add_error(f"TeX file is empty: {path}")
        return rep
    if size > MAX_TEX_BYTES:
        # Large files are still processed; this is informational only.
        rep.add_warning(
            f".tex file is large ({size/1024/1024:.1f} MB). Some checks scan whole content."
        )

    try:
        # Undecodable bytes are replaced rather than treated as fatal; the
        # checks below only look for ASCII command names.
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        rep.add_error(f"Cannot read tex file: {e}")
        return rep

    has_cite = bool(_TEX_HAS_CITES.search(text))
    has_bib_decl = bool(_TEX_HAS_BIB.search(text))
    # Either signal is enough; warn only when both are absent.
    if not (has_cite or has_bib_decl):
        rep.add_warning(
            f"{path.name} contains no \\cite{{...}} and no \\bibliography{{...}} — "
            "BibGuard's bibliography checks won't find anything to verify."
        )
    return rep
def format_report(rep: ValidationReport, label: str = "") -> str:
    """Render a ValidationReport as text suitable for stdout.

    Args:
        rep: Report whose ``errors`` and ``warnings`` are rendered.
        label: Optional tag; when non-empty every line starts with
            ``[label] ``.

    Returns:
        One line per message joined with newlines, all errors first and then
        all warnings. The result is an empty string when the report holds no
        messages.
    """
    tag = ""
    if label:
        tag = f"[{label}] "

    error_lines = [f"{tag}ERROR: {message}" for message in rep.errors]
    warning_lines = [f"{tag}WARN: {message}" for message in rep.warnings]
    return "\n".join(error_lines + warning_lines)