Spaces:

thinkwee
/

BibGuard

Running

BibGuard / src /checkers /reference_checker.py

thinkwee

update reference checker

79d7264 4 days ago

8.04 kB

	"""
	Cross-reference checker.

	Validates that:
	- All figures and tables are referenced in text
	- All labels have corresponding references
	- All references point to a defined label
	- Appendix sections are referenced in main text
	"""
	import re
	from typing import List, Set, Tuple

	from .base import BaseChecker, CheckResult, CheckSeverity


	class ReferenceChecker(BaseChecker):
	    """Check cross-reference integrity in a LaTeX document.

	    Three kinds of problems are reported:

	    * labels that are never referenced,
	    * references whose label is never defined,
	    * section/appendix labels located after the appendix start that are
	      not referenced anywhere before it.

	    The helpers ``_is_commented``, ``_find_line_number``,
	    ``_get_line_content`` and ``_create_result`` are not defined in this
	    class; presumably they are inherited from ``BaseChecker``.
	    """

	    name = "reference"
	    display_name = "Cross-References"
	    description = "Verify all figures, tables, and sections are properly referenced"

	    # Label pattern: \label{prefix:name}
	    LABEL_PATTERN = re.compile(r'\\label\{([^}]+)\}')

	    # Reference patterns; group 1 is the (possibly comma-separated) key list.
	    REF_PATTERNS = [
	        re.compile(r'\\ref\{([^}]+)\}'),
	        re.compile(r'\\autoref\{([^}]+)\}'),
	        re.compile(r'\\cref\{([^}]+)\}'),
	        re.compile(r'\\Cref\{([^}]+)\}'),
	        re.compile(r'\\eqref\{([^}]+)\}'),
	        re.compile(r'\\pageref\{([^}]+)\}'),
	        re.compile(r'\\nameref\{([^}]+)\}'),
	        re.compile(r'\\Sref\{([^}]+)\}'),
	    ]

	    # Appendix detection: \appendix or \begin{appendix}.
	    # The alternation must be a bare `|`; an escaped `\|` would only match
	    # a literal pipe character and the appendix would never be found.
	    APPENDIX_START_PATTERN = re.compile(r'\\appendix\b|\\begin\{appendix\}')
	    SECTION_PATTERN = re.compile(r'\\section\*?\{([^}]+)\}')

	    # Lines that define macros (\newcommand, \renewcommand, \providecommand);
	    # references found on such lines are ignored.
	    _COMMAND_DEFINITION_PATTERN = re.compile(r'\\(?:new|renew|provide)command')

	    # Label prefix (text before the first ':') -> human-readable type.
	    _LABEL_TYPES = {
	        'fig': 'figure',
	        'tab': 'table',
	        'sec': 'section',
	        'eq': 'equation',
	        'alg': 'algorithm',
	        'lst': 'listing',
	        'app': 'appendix',
	    }

	    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
	        """Run all cross-reference checks on ``tex_content``.

	        Args:
	            tex_content: Full LaTeX source to analyse.
	            config: Accepted for interface compatibility; not read here.

	        Returns:
	            One failed result per unreferenced label, per undefined
	            reference, and per unreferenced appendix section label.
	        """
	        results = []

	        labels = self._extract_labels(tex_content)
	        references = self._extract_references(tex_content)

	        # Labels that nothing points to.
	        for label, (line_num, line_content) in labels.items():
	            if label in references:
	                continue
	            label_type = self._get_label_type(label)
	            results.append(self._create_result(
	                passed=False,
	                severity=self._get_severity_for_label(label),
	                message=f"Unreferenced {label_type}: '{label}'",
	                line_number=line_num,
	                line_content=line_content,
	                suggestion=f"Add \\ref{{{label}}} or \\autoref{{{label}}} where appropriate"
	            ))

	        # References that point to nothing.
	        for ref, (line_num, line_content) in references.items():
	            if ref in labels:
	                continue
	            results.append(self._create_result(
	                passed=False,
	                severity=CheckSeverity.ERROR,
	                message=f"Reference to undefined label: '{ref}'",
	                line_number=line_num,
	                line_content=line_content,
	                suggestion=f"Add \\label{{{ref}}} to the target element or fix the reference"
	            ))

	        results.extend(
	            self._check_appendix_references(tex_content, labels, references)
	        )
	        return results

	    def _extract_labels(self, content: str) -> dict:
	        """Map each non-commented label to ``(line_number, line_content)``.

	        If a label is defined more than once, the location of the last
	        definition is kept.
	        """
	        labels = {}
	        for match in self.LABEL_PATTERN.finditer(content):
	            if self._is_commented(content, match.start()):
	                continue
	            line_num = self._find_line_number(content, match.start())
	            labels[match.group(1)] = (
	                line_num,
	                self._get_line_content(content, line_num),
	            )
	        return labels

	    def _extract_references(self, content: str) -> dict:
	        """Map each referenced key to ``(line_number, line_content)``.

	        Only the first location recorded for a key is kept. Patterns are
	        scanned one after another, so "first" means first for the earliest
	        pattern in ``REF_PATTERNS`` that mentions the key, not necessarily
	        the first position in the document.

	        Skipped: commented-out references, macro parameters such as ``#1``,
	        and references on lines that define a command.
	        """
	        references = {}
	        for pattern in self.REF_PATTERNS:
	            for match in pattern.finditer(content):
	                if self._is_commented(content, match.start()):
	                    continue

	                # Handle comma-separated refs like \ref{fig:a,fig:b}
	                new_refs = []
	                for raw_ref in match.group(1).split(','):
	                    ref = raw_ref.strip()
	                    if not ref or ref in references:
	                        continue
	                    # Macro parameter placeholder (#1 .. #9), not a label.
	                    if len(ref) == 2 and ref[0] == '#' and ref[1].isdigit():
	                        continue
	                    new_refs.append(ref)

	                # Resolve the location only when something new was found.
	                if not new_refs:
	                    continue

	                line_num = self._find_line_number(content, match.start())
	                line_content = self._get_line_content(content, line_num)

	                # References inside a macro definition are templates, not uses.
	                if self._COMMAND_DEFINITION_PATTERN.search(line_content):
	                    continue

	                for ref in new_refs:
	                    references[ref] = (line_num, line_content)
	        return references

	    def _get_label_type(self, label: str) -> str:
	        """Return a readable type name derived from the label's prefix.

	        The prefix is the text before the first ``:``, compared
	        case-insensitively. Labels without ``:`` or with an unknown prefix
	        yield ``'label'``.
	        """
	        prefix, separator, _ = label.partition(':')
	        if not separator:
	            return 'label'
	        return self._LABEL_TYPES.get(prefix.lower(), 'label')

	    def _get_severity_for_label(self, label: str) -> CheckSeverity:
	        """Return the severity to use when ``label`` is unreferenced."""
	        # Equations might not always need an explicit reference; every
	        # other label type (figures and tables included) is a warning.
	        if self._get_label_type(label) == 'equation':
	            return CheckSeverity.INFO
	        return CheckSeverity.WARNING

	    def _find_appendix_start(self, content: str):
	        """Return the offset of the first non-commented appendix marker, or None."""
	        for match in self.APPENDIX_START_PATTERN.finditer(content):
	            if not self._is_commented(content, match.start()):
	                return match.start()
	        return None

	    def _check_appendix_references(
	        self,
	        content: str,
	        labels: dict,
	        references: dict
	    ) -> List[CheckResult]:
	        """Check that appendix sections are referenced in main text.

	        A label counts as an appendix section when it is located after the
	        appendix marker and its name contains ``sec`` or ``app``
	        (case-insensitive substring heuristic). It is considered referenced
	        when a non-commented reference before the appendix marker names
	        exactly that label.

	        Args:
	            content: Full LaTeX source.
	            labels: Accepted for interface compatibility; not read here.
	            references: Accepted for interface compatibility; not read here.

	        Returns:
	            One warning per unreferenced appendix section label; an empty
	            list when the document has no appendix marker.
	        """
	        appendix_start = self._find_appendix_start(content)
	        if appendix_start is None:
	            return []

	        # Exact reference keys used before the appendix. Reusing the normal
	        # extractor gives comma splitting and comment handling, so 'sec:a'
	        # is not treated as referenced by \ref{sec:ab}.
	        main_refs = self._extract_references(content[:appendix_start])

	        results = []
	        # Scan the full document from the appendix offset so that match
	        # positions are absolute and comment detection sees whole lines.
	        for match in self.LABEL_PATTERN.finditer(content, appendix_start):
	            if self._is_commented(content, match.start()):
	                continue

	            label = match.group(1)
	            lowered = label.lower()
	            if 'sec' not in lowered and 'app' not in lowered:
	                continue
	            if label in main_refs:
	                continue

	            results.append(self._create_result(
	                passed=False,
	                severity=CheckSeverity.WARNING,
	                message=f"Appendix section '{label}' is not referenced in main text",
	                line_number=self._find_line_number(content, match.start()),
	                suggestion="Add a reference to this appendix section in the main text"
	            ))

	        return results