Spaces:

thinkwee
/

BibGuard

Running

BibGuard / src /checkers /template_checker.py

thinkwee

v2.0

fcffa22 11 days ago

17.3 kB

	"""
	Conference-template conformance checker.

	Reads the rich rule set defined in :mod:`src.templates.base_template` and runs
	per-venue checks against the LaTeX source. Each rule fragment lives in its own
	small private method so adding new conferences (or new rules) doesn't bloat the
	public ``check`` method.

	Severity convention used here:

	* ``ERROR`` — desk-reject material if uncorrected (NeurIPS missing checklist,
	ACL missing Limitations, double-blind \\author leak).
	* ``WARNING`` — likely a real problem but might be a false positive (style
	package mismatch, identifying URL).
	* ``INFO`` — soft reminder that something MUST happen later (camera-ready
	sections, lay summaries, font requirements, page-limit
	estimation that the .tex source can't actually verify).
	"""
	from __future__ import annotations

	import re
	from typing import List, Optional

	from .base import BaseChecker, CheckResult, CheckSeverity


	# ------------------------------------------------------------------ helpers ---

	# Match \section{X}, \subsection{X}, \paragraph{X}, optionally starred,
	# allowing an optional [short] argument before the {body}.
	def _section_pattern(name: str) -> re.Pattern:
	    r"""Compile a case-insensitive regex that finds a heading mentioning *name*.

	    The pattern accepts ``\section``, ``\subsection`` or ``\paragraph``,
	    an optional ``*``, an optional ``[short title]`` argument and then a
	    ``{title}`` whose text contains *name* anywhere, so "Limitations" also
	    matches ``\section{Limitations and Future Work}``.

	    Args:
	        name: Literal heading text to look for; it is regex-escaped here.

	    Returns:
	        The compiled pattern.
	    """
	    return re.compile(
	        # `|` must be a real alternation and `\*?` an optional literal star;
	        # the `*` quantifiers allow empty/arbitrary whitespace and titles.
	        r'\\(?:section|subsection|paragraph)\*?\s*(?:\[[^\]]*\])?\s*\{[^}]*?'
	        + re.escape(name) + r'[^}]*\}',
	        re.IGNORECASE,
	    )


	# URL shapes that strongly de-anonymize an author: personal code-hosting
	# namespaces, project pages, experiment trackers and social profiles.
	# Namespaces that start with "anonymous" are skipped via the negative
	# lookahead; every pattern is case-insensitive.
	_IDENTIFYING_URL_PATTERNS = [
	    re.compile(r'\bgithub\.com/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
	    re.compile(r'\b[A-Za-z0-9_\-]+\.github\.io\b', re.IGNORECASE),
	    re.compile(r'\bgitlab\.com/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
	    re.compile(r'\bbitbucket\.org/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
	    # `|` must be an unescaped alternation; `\|` would only match a literal pipe.
	    re.compile(r'\b(?:huggingface\.co|wandb\.ai)/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
	    re.compile(r'\b(?:linkedin|twitter|x)\.com/[A-Za-z0-9_\-]+', re.IGNORECASE),
	]

	# Substrings that mark a URL as anonymous-friendly; a URL containing any of
	# them (case-insensitively) should NOT be flagged as identifying.
	_ANONYMOUS_URL_HINTS = re.compile(
	    # Unescaped `|` so each hint is its own alternative.
	    r'(anonymous|anon|blind|review|submission|4open\.science)', re.IGNORECASE,
	)

	# Capture URLs from \url{...} and \href{...}{...} (group 1 = first braced
	# argument) or from a bare http(s)://... (group 2). The lookbehind keeps the
	# bare form from starting in the middle of a word or right after a slash.
	_URL_FROM_TEX = re.compile(
	    r'\\(?:url|href)\s*\{([^}]+)\}|(?<![/\w])(https?://[^\s,)\\]+)',
	)

	# Ways a template can introduce acknowledgments: a (possibly starred)
	# \section whose title is exactly an "Acknowledg..." word, an
	# \acknowledgment(s){ macro, or an `acks` environment.
	_ACK_PATTERNS = [
	    # `\*?` = optional star; `\w*` covers Acknowledgment/Acknowledgements/...
	    re.compile(r'\\section\*?\s*\{\s*Acknowledg\w*\s*\}', re.IGNORECASE),
	    re.compile(r'\\acknowledgments?\s*\{', re.IGNORECASE),
	    re.compile(r'\\begin\{acks\}', re.IGNORECASE),
	]

	# NeurIPS Paper Checklist markers: the official template either calls
	# \input{neurips_paper_checklist} or includes a
	# \section*{NeurIPS Paper Checklist}. Any one pattern matching is enough.
	_NEURIPS_CHECKLIST_PATTERNS = [
	    # Section title containing "Paper Checklist" anywhere inside the braces.
	    re.compile(r'\\section\*?\s*\{[^}]*Paper\s+Checklist[^}]*\}', re.IGNORECASE),
	    # \input of a file whose path contains paper_checklist / paper-checklist / paperchecklist.
	    re.compile(r'\\input\{[^}]*paper[_\-]?checklist[^}]*\}', re.IGNORECASE),
	    # \input of neurips[_-]<up to 4 digits>[_-]checklist, e.g. neurips_2024_checklist.
	    re.compile(r'\\input\{[^}]*neurips[_\-]?\d{0,4}[_\-]?checklist[^}]*\}', re.IGNORECASE),
	    re.compile(r'\\paperchecklist\b', re.IGNORECASE),
	]

	# Reproducibility Statement (ICLR / NeurIPS): a (possibly starred) \section
	# whose title contains "Reproducibility", matched case-insensitively.
	_REPRO_SECTION = re.compile(
	    r'\\section\*?\s*\{[^}]*Reproducibility[^}]*\}', re.IGNORECASE,
	)

	# \documentclass[options]{class}: group 1 is the raw option list (None when
	# the brackets are absent, "" when they are empty), group 2 the class name.
	# The options are where the paper size is declared.
	_DOCCLASS_RE = re.compile(
	    r'\\documentclass\s*(?:\[([^\]]*)\])?\s*\{([^}]+)\}'
	)

	# A deliberately rough detector for content that breaks the ACL
	# "discussion only" rule for Limitations: the start of a table, figure or
	# algorithm environment (optionally starred), or a new \section heading.
	_FLOAT_OR_NEW_SECTION_RE = re.compile(
	    r'\\begin\{(?:table|figure|algorithm)\*?\}|\\section\*?\s*\{', re.IGNORECASE,
	)


	# ----------------------------------------------------------------- checker ---

	class TemplateChecker(BaseChecker):
	    """Run a venue template's submission rules against a LaTeX source.

	    ``check`` is the only entry point. Each rule lives in its own private
	    ``_check_*`` / ``_inform_*`` method that appends findings to a shared
	    results list. ``_remove_comments``, ``_create_result`` and
	    ``_find_line_number`` are not defined here (presumably inherited from
	    ``BaseChecker`` - confirm against the base class).
	    """

	    # Identifier and human-readable labels for this checker.
	    name = "template"
	    display_name = "Conference Template"
	    description = "Verify per-venue submission rules (sections, style, anonymity, deliverables)"

	    def check(self, tex_content: str, config: Optional[dict] = None) -> List[CheckResult]:
	        """Run every rule enabled by ``config["template"]`` on the source.

	        Args:
	            tex_content: Raw LaTeX source; comments are stripped before any
	                rule runs.
	            config: Optional settings dict. Only the ``"template"`` key is
	                read; without it nothing is checked.

	        Returns:
	            The findings in the order the rules ran; an empty list when no
	            template is configured.
	        """
	        config = config or {}
	        template = config.get("template")
	        if template is None:
	            return []

	        content = self._remove_comments(tex_content)
	        results: List[CheckResult] = []

	        # Rules that apply to every template.
	        self._check_mandatory_sections(template, content, results)
	        self._check_camera_only_sections(template, content, results)
	        self._check_style_package(template, content, results)
	        self._check_doc_class(template, content, results)
	        self._check_paper_size(template, content, results)

	        # Anonymity rules only make sense for double-blind venues.
	        if template.double_blind:
	            self._check_double_blind_author(template, content, results)
	            if template.forbid_identifying_urls:
	                self._check_identifying_urls(template, content, results)
	            if template.forbid_acks_in_review:
	                self._check_acknowledgments(template, content, results)

	        # Per-venue deliverables and reminders.
	        if template.requires_paper_checklist:
	            self._check_paper_checklist(template, content, results)
	        if template.requires_reproducibility_statement:
	            self._check_reproducibility_statement(template, content, results)
	        if template.requires_lay_summary_camera:
	            self._inform_lay_summary(template, results)
	        if template.requires_type1_fonts:
	            self._inform_type1_fonts(template, results)
	        if template.min_main_pages > 0:
	            self._inform_min_pages(template, results)

	        # Same `or []` guard as _check_mandatory_sections, so a template
	        # without mandatory sections cannot raise TypeError here.
	        if "Limitations" in (template.mandatory_sections or []):
	            self._check_limitations_content(template, content, results)

	        return results

	    # ============================================================== sections ==

	    def _check_mandatory_sections(self, template, content: str, results: List[CheckResult]):
	        """Append an ERROR for each mandatory section with no matching heading."""
	        for section in template.mandatory_sections or []:
	            if not _section_pattern(section).search(content):
	                results.append(self._create_result(
	                    passed=False,
	                    severity=CheckSeverity.ERROR,
	                    message=f"[{template.name}] Missing mandatory section: '{section}'",
	                    suggestion=f"Add `\\section{{{section}}}` (required by {template.name}).",
	                ))

	    def _check_camera_only_sections(self, template, content: str, results: List[CheckResult]):
	        """Append an INFO reminder for each camera-ready section not found yet."""
	        for section in template.mandatory_camera_sections or []:
	            if not _section_pattern(section).search(content):
	                results.append(self._create_result(
	                    passed=False,
	                    severity=CheckSeverity.INFO,
	                    message=(
	                        f"[{template.name}] Camera-ready section '{section}' not found. "
	                        "Required for the camera-ready version, optional for review."
	                    ),
	                    suggestion=f"Add `\\section{{{section}}}` before References for camera-ready.",
	                ))

	    # =================================================== style / typesetting ==

	    def _check_style_package(self, template, content: str, results: List[CheckResult]):
	        """Warn when the template's style package is not loaded.

	        The package counts as present when it is the sole braced argument of
	        a ``usepackage`` or ``documentclass`` command, with or without a
	        bracketed option list. Does nothing when the template names none.
	        """
	        pkg = (template.style_package or "").strip()
	        if not pkg:
	            return
	        pkg_re = re.compile(
	            r'\\(?:usepackage|documentclass)(?:\[[^\]]*\])?\s*\{\s*'
	            + re.escape(pkg) + r'\s*\}'
	        )
	        if not pkg_re.search(content):
	            results.append(self._create_result(
	                passed=False,
	                severity=CheckSeverity.WARNING,
	                message=(
	                    f"[{template.name}] Style package '{pkg}' not found. "
	                    "If you really are submitting to this venue, your template may be wrong."
	                ),
	                suggestion=f"Use the official `{pkg}` style package.",
	            ))

	    def _check_doc_class(self, template, content: str, results: List[CheckResult]):
	        """Warn when the first document class differs from the expected one.

	        The comparison is case-insensitive; a source with no document class
	        at all is reported as ``none``. Does nothing when the template does
	        not prescribe a class.
	        """
	        wanted = (template.doc_class or "").strip()
	        if not wanted:
	            return
	        m = _DOCCLASS_RE.search(content)
	        actual = m.group(2).strip() if m else ""
	        if actual.lower() != wanted.lower():
	            # NOTE(review): the "(Springer LNCS for ECCV)" hint is
	            # venue-specific although this rule runs for any template that
	            # sets doc_class - confirm whether it should be generic.
	            results.append(self._create_result(
	                passed=False,
	                severity=CheckSeverity.WARNING,
	                message=(
	                    f"[{template.name}] Expected `\\documentclass{{{wanted}}}`, "
	                    f"found `{actual or 'none'}`."
	                ),
	                suggestion=f"Use the official document class `{wanted}` (Springer LNCS for ECCV).",
	            ))

	    def _check_paper_size(self, template, content: str, results: List[CheckResult]):
	        """Warn when the document-class options name the wrong paper size.

	        Only ``letter`` and ``a4`` are understood. Nothing is reported when
	        the source has no document class or its options mention neither
	        size, because the class default cannot be determined here.
	        """
	        wanted = (template.paper_size or "").lower()
	        if wanted not in {"letter", "a4"}:
	            return
	        m = _DOCCLASS_RE.search(content)
	        if not m:
	            return
	        opts = (m.group(1) or "").lower()
	        # Substring test on the raw option list; letter wins if both appear.
	        actual = None
	        if "letterpaper" in opts or "letter" in opts:
	            actual = "letter"
	        elif "a4paper" in opts or "a4" in opts:
	            actual = "a4"
	        if actual and actual != wanted:
	            results.append(self._create_result(
	                passed=False,
	                severity=CheckSeverity.WARNING,
	                message=(
	                    f"[{template.name}] Expected paper size '{wanted}', "
	                    f"document class is set to '{actual}'."
	                ),
	                suggestion=f"Use `\\documentclass[{wanted}paper]{{...}}`.",
	            ))

	    # ================================================================ blinding =

	    def _check_double_blind_author(self, template, content: str, results: List[CheckResult]):
	        """Flag a non-empty author block that does not look anonymized.

	        Only the first author command is inspected, and its body is read up
	        to the first closing brace. A body that is blank or contains
	        "anonymous", "hidden", "blind" or "submission" (any case) passes.
	        """
	        m = re.search(r'\\author\s*(?:\[[^\]]*\])?\s*\{([^}]*)\}', content)
	        if not m:
	            return
	        body = m.group(1)
	        if not body.strip():
	            return
	        if re.search(r'(anonymous|hidden|blind|submission)', body, re.IGNORECASE):
	            return
	        line_num = self._find_line_number(content, m.start())
	        results.append(self._create_result(
	            passed=False,
	            severity=CheckSeverity.ERROR,
	            message=f"[{template.name}] Double-blind: \\author appears to contain identifying info",
	            line_number=line_num,
	            line_content=body.strip(),
	            suggestion=r"Replace \author with anonymous placeholder during review.",
	        ))

	    def _check_identifying_urls(self, template, content: str, results: List[CheckResult]):
	        """Warn about each URL in the source that matches an identifying pattern.

	        URLs carrying an anonymity hint are skipped, and each URL occurrence
	        produces at most one finding.
	        """
	        for m in _URL_FROM_TEX.finditer(content):
	            # Group 1 is the macro form, group 2 the bare http(s) form.
	            url = (m.group(1) or m.group(2) or "").strip()
	            if not url:
	                continue
	            if _ANONYMOUS_URL_HINTS.search(url):
	                continue
	            for pat in _IDENTIFYING_URL_PATTERNS:
	                if pat.search(url):
	                    line_num = self._find_line_number(content, m.start())
	                    results.append(self._create_result(
	                        passed=False,
	                        severity=CheckSeverity.WARNING,
	                        message=(
	                            f"[{template.name}] Possible identifying URL during double-blind review: "
	                            f"{url[:120]}"
	                        ),
	                        line_number=line_num,
	                        line_content=url,
	                        suggestion=(
	                            "Use Anonymous GitHub (https://anonymous.4open.science) or remove "
	                            "the link until the camera-ready version."
	                        ),
	                    ))
	                    break  # one finding per URL

	    def _check_acknowledgments(self, template, content: str, results: List[CheckResult]):
	        """Warn once if any acknowledgments marker is present in the source."""
	        for pat in _ACK_PATTERNS:
	            m = pat.search(content)
	            if m:
	                line_num = self._find_line_number(content, m.start())
	                results.append(self._create_result(
	                    passed=False,
	                    severity=CheckSeverity.WARNING,
	                    message=(
	                        f"[{template.name}] Acknowledgments section detected; "
	                        f"{template.short_name.upper()} requires omitting it during review."
	                    ),
	                    line_number=line_num,
	                    suggestion=(
	                        "Comment out or wrap acks in `\\if<reviewmode>...\\fi` so they only "
	                        "appear in the camera-ready version."
	                    ),
	                ))
	                return  # one finding is enough

	    # ============================================== per-venue special items ===

	    def _check_paper_checklist(self, template, content: str, results: List[CheckResult]):
	        """Append an ERROR when no paper-checklist marker is found."""
	        for pat in _NEURIPS_CHECKLIST_PATTERNS:
	            if pat.search(content):
	                return
	        results.append(self._create_result(
	            passed=False,
	            severity=CheckSeverity.ERROR,
	            message=(
	                f"[{template.name}] NeurIPS Paper Checklist not found. "
	                "NeurIPS desk-rejects submissions without the checklist."
	            ),
	            suggestion=(
	                "Add `\\input{neurips_paper_checklist}` (or paste the official template) "
	                "after References / supplementary."
	            ),
	        ))

	    def _check_reproducibility_statement(self, template, content: str, results: List[CheckResult]):
	        """Append an INFO note when no Reproducibility section heading is found."""
	        if _REPRO_SECTION.search(content):
	            return
	        results.append(self._create_result(
	            passed=False,
	            severity=CheckSeverity.INFO,
	            message=(
	                f"[{template.name}] Reproducibility Statement not found. "
	                "It's encouraged (~1 page) and does not count toward the page limit."
	            ),
	            suggestion=(
	                "Add `\\section*{Reproducibility Statement}` before References summarizing "
	                "code/data/seeds/hyperparameter availability."
	            ),
	        ))

	    def _inform_lay_summary(self, template, results: List[CheckResult]):
	        """Append an unconditional INFO reminder about the lay summary."""
	        results.append(self._create_result(
	            passed=False,
	            severity=CheckSeverity.INFO,
	            message=(
	                f"[{template.name}] Lay summary required at camera-ready time "
	                "(plain-language summary submitted via OpenReview)."
	            ),
	            suggestion="Draft a 1–2 paragraph plain-language summary now to avoid a last-minute scramble.",
	        ))

	    def _inform_type1_fonts(self, template, results: List[CheckResult]):
	        """Append an unconditional INFO reminder about Type-1 fonts."""
	        results.append(self._create_result(
	            passed=False,
	            severity=CheckSeverity.INFO,
	            message=(
	                f"[{template.name}] Embedded fonts must be Type-1 only — verify with "
	                "`pdffonts <paper.pdf>`. Cannot be checked from .tex source alone."
	            ),
	            suggestion="Compile with `pdflatex` (not XeLaTeX/LuaLaTeX) and convert any Type-3 fonts.",
	        ))

	    def _inform_min_pages(self, template, results: List[CheckResult]):
	        """Append an unconditional INFO reminder of the main-text page range."""
	        results.append(self._create_result(
	            passed=False,
	            severity=CheckSeverity.INFO,
	            message=(
	                f"[{template.name}] Main text must be at least {template.min_main_pages} pages "
	                f"and at most {template.page_limit_review} pages. Cannot be measured from source."
	            ),
	            suggestion=(
	                f"Compile and confirm the rendered PDF stays within "
	                f"{template.min_main_pages}–{template.page_limit_review} pages of main text."
	            ),
	        ))

	    # ============================================ ACL family: Limitations rule

	    def _check_limitations_content(self, template, content: str, results: List[CheckResult]):
	        """Warn when the Limitations section contains more than prose.

	        The section body runs from the end of the first Limitations heading
	        to the next section heading, or to the end of the document. The
	        reported line number is where that body starts.
	        """
	        # Find the Limitations section span up to the next \section or end of doc.
	        m = re.search(
	            r'\\section\*?\s*(?:\[[^\]]*\])?\s*\{[^}]*Limitations[^}]*\}',
	            content, re.IGNORECASE,
	        )
	        if not m:
	            return  # mandatory_sections check already flagged absence
	        start = m.end()
	        nxt = re.search(r'\\section\*?\s*\{', content[start:], re.IGNORECASE)
	        end = start + nxt.start() if nxt else len(content)
	        section_body = content[start:end]
	        # Discussion-only rule: no floats, no nested \section.
	        # NOTE(review): the body already stops at the next \section heading,
	        # so only float environments can be detected here - confirm whether
	        # \subsection should count as a nested section.
	        if _FLOAT_OR_NEW_SECTION_RE.search(section_body):
	            line_num = self._find_line_number(content, start)
	            results.append(self._create_result(
	                passed=False,
	                severity=CheckSeverity.WARNING,
	                message=(
	                    f"[{template.name}] Limitations section appears to contain floats or a "
	                    "nested section. ACL/EMNLP/NAACL require Limitations to be discussion only."
	                ),
	                line_number=line_num,
	                suggestion=(
	                    "Move tables/figures/algorithms out of Limitations into the main body or "
	                    "appendix; Limitations should be prose-only."
	                ),
	            ))