Spaces:

thinkwee
/

BibGuard

Running

BibGuard / src /checkers /anonymization_checker.py

thinkwee

init

46df5f0 13 days ago

8.82 kB

	"""
	Anonymization checker.

	For double-blind review submissions, checks for:
	- Author name leaks in acknowledgments
	- Personal URLs (GitHub, personal pages)
	- Self-citations that reveal identity
	- Institutional information in comments
	"""
	import re
	from typing import List

	from .base import BaseChecker, CheckResult, CheckSeverity


	class AnonymizationChecker(BaseChecker):
	    """Check for anonymization issues in double-blind submissions.

	    The checker scans LaTeX source for content that may reveal who wrote
	    the paper: personal/profile URLs, an active acknowledgments section,
	    self-referential citation phrasing, and real-looking names in the
	    author field.  All checks are regex heuristics, so results are hints
	    for a human to confirm rather than definitive findings.
	    """

	    name = "anonymization"
	    display_name = "Anonymization"
	    description = "Detect potential identity leaks in double-blind submissions"

	    # (regex, description) pairs for URLs that typically identify a person.
	    # They are matched case-insensitively against every source line.
	    PERSONAL_URL_PATTERNS = [
	        (r'github\.com/[a-zA-Z0-9_-]+', "GitHub profile URL"),
	        (r'gitlab\.com/[a-zA-Z0-9_-]+', "GitLab profile URL"),
	        (r'twitter\.com/[a-zA-Z0-9_]+', "Twitter profile URL"),
	        (r'linkedin\.com/in/[a-zA-Z0-9_-]+', "LinkedIn profile URL"),
	        (r'huggingface\.co/[a-zA-Z0-9_-]+', "HuggingFace profile URL"),
	        (r'~[a-zA-Z]+/', "Personal university page"),
	        (r'people\.[a-zA-Z]+\.edu', "Academic personal page"),
	        (r'homes\.[a-zA-Z]+\.(edu|ac\.[a-z]+)', "Academic home page"),
	    ]

	    # Anonymous submission indicators (should be present).
	    # Not referenced by the methods of this class; kept as public data.
	    ANONYMOUS_MARKERS = [
	        r'\\author\{[^}]*anonymous[^}]*\}',
	        r'anonymous\s+submission',
	        r'\\runningauthor\{[^}]*\}',  # Should be empty or generic
	    ]

	    # Phrasings that tend to tie a cited work to the submitting authors.
	    # The \b anchors keep "our"/"we" from matching inside longer words
	    # (e.g. "four previous papers").
	    SELF_CITE_PATTERNS = [
	        r'\\cite[pt]?\{[^}]*\}\s*(?:show|demonstrate|propose|present|introduce)',
	        r'\b(?:our|we)\s+(?:previous|prior|earlier)\s+(?:work|paper|study)',
	        r'\b(?:as\s+)?(?:we|the\s+authors?)\s+(?:have\s+)?(?:shown|demonstrated|proved)',
	    ]

	    # Start of an acknowledgments section: \section{Acknowledgment...},
	    # \section*{Acknowledgment...} or \begin{ack...}.
	    ACK_PATTERN = re.compile(
	        r'\\(?:section\*?\{acknowledgment|begin\{ack)',
	        re.IGNORECASE
	    )

	    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
	        """Run all anonymization checks on a LaTeX document.

	        Args:
	            tex_content: Full LaTeX source as a single string.
	            config: Accepted for interface compatibility; not used here.

	        Returns:
	            A list of check results.  If the document does not look like a
	            review version, the list holds a single passing INFO result and
	            no other check is run.  Otherwise it holds one failing result
	            per finding, in this order: personal URLs, acknowledgments,
	            self-citations, author field.
	        """
	        if not self._is_review_version(tex_content):
	            # Camera-ready documents are expected to carry author identity.
	            return [self._create_result(
	                passed=True,
	                severity=CheckSeverity.INFO,
	                message="Document appears to be camera-ready version (not checking anonymization)"
	            )]

	        lines = tex_content.split('\n')

	        results = []
	        results.extend(self._check_personal_urls(lines))
	        results.extend(self._check_acknowledgments(tex_content, lines))
	        results.extend(self._check_self_citations(lines))
	        results.extend(self._check_author_field(tex_content))
	        return results

	    def _check_personal_urls(self, lines: List[str]) -> List[CheckResult]:
	        """Flag lines that contain a URL matching PERSONAL_URL_PATTERNS.

	        Lines that the base-class helper ``_is_comment_line`` reports as
	        comments yield a WARNING; all other lines yield an ERROR.
	        """
	        # Compile once per call instead of once per line.
	        url_regexes = [
	            (re.compile(pattern, re.IGNORECASE), desc)
	            for pattern, desc in self.PERSONAL_URL_PATTERNS
	        ]

	        results = []
	        for line_num, line in enumerate(lines, 1):
	            in_comment = self._is_comment_line(line)
	            for regex, desc in url_regexes:
	                if not regex.search(line):
	                    continue
	                if in_comment:
	                    # Comments are still scanned for leaks, at lower severity.
	                    severity = CheckSeverity.WARNING
	                    message = f"{desc} in comment (could be revealed when compiling)"
	                    suggestion = "Remove or anonymize URL even in comments"
	                else:
	                    severity = CheckSeverity.ERROR
	                    message = f"{desc} may reveal author identity"
	                    suggestion = "Replace with anonymized URL or remove for review"
	                results.append(self._create_result(
	                    passed=False,
	                    severity=severity,
	                    message=message,
	                    line_number=line_num,
	                    line_content=line.strip()[:100],
	                    suggestion=suggestion
	                ))
	        return results

	    def _check_self_citations(self, lines: List[str]) -> List[CheckResult]:
	        """Flag non-comment lines that match any of SELF_CITE_PATTERNS."""
	        cite_regexes = [
	            re.compile(pattern, re.IGNORECASE)
	            for pattern in self.SELF_CITE_PATTERNS
	        ]

	        results = []
	        for line_num, line in enumerate(lines, 1):
	            if self._is_comment_line(line):
	                continue
	            for regex in cite_regexes:
	                if regex.search(line):
	                    results.append(self._create_result(
	                        passed=False,
	                        severity=CheckSeverity.WARNING,
	                        message="Potentially self-revealing citation pattern",
	                        line_number=line_num,
	                        line_content=line.strip()[:100],
	                        suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
	                    ))
	        return results

	    def _is_review_version(self, content: str) -> bool:
	        """Detect if this is a review (anonymous) version.

	        Only the first 2000 characters of ``content`` are inspected.

	        Returns:
	            True if any review indicator is found, False if none is found
	            but a camera-ready indicator is, and True otherwise.
	        """
	        header = content[:2000]

	        # Common anonymous submission markers.
	        # NOTE(review): the bare word "review" matches very broadly and these
	        # markers win over the camera-ready ones below - confirm this
	        # precedence is intended.
	        review_indicators = [
	            r'review',
	            r'submitted\s+to',
	            r'under\s+review',
	            r'anonymous',
	            r'\\usepackage\[review\]',
	        ]
	        if any(re.search(p, header, re.IGNORECASE) for p in review_indicators):
	            return True

	        # Camera-ready indicators (negative).
	        camera_indicators = [
	            r'\\usepackage\[accepted\]',
	            r'\\usepackage\[final\]',
	            r'camera[\s-]?ready',
	        ]
	        if any(re.search(p, header, re.IGNORECASE) for p in camera_indicators):
	            return False

	        # Default to review version (safer): run the checks when unsure.
	        return True

	    def _check_acknowledgments(self, content: str, lines: List[str]) -> List[CheckResult]:
	        """Check acknowledgments section for identity leaks.

	        Args:
	            content: Full LaTeX source.
	            lines: ``content`` split on newlines.

	        Returns:
	            A one-element list with a WARNING for the first acknowledgments
	            heading whose line does not start with ``%``; an empty list if
	            there is no heading or every heading is commented out.
	        """
	        # Walk every occurrence: a commented-out heading must not hide a
	        # later one that is still active.
	        for ack_match in self.ACK_PATTERN.finditer(content):
	            # Base-class helper; the indexing below assumes it returns a
	            # 1-based line number - TODO confirm.
	            ack_line = self._find_line_number(content, ack_match.start())

	            actual_line = lines[ack_line - 1] if ack_line <= len(lines) else ""
	            if actual_line.lstrip().startswith('%'):
	                continue

	            return [self._create_result(
	                passed=False,
	                severity=CheckSeverity.WARNING,
	                message="Acknowledgments section found - should be commented out for review",
	                line_number=ack_line,
	                suggestion="Comment out acknowledgments with % for anonymous submission"
	            )]

	        return []

	    def _check_author_field(self, content: str) -> List[CheckResult]:
	        """Check the first \\author{...} field for revealing content.

	        The field body is extracted by brace matching so that nested groups
	        and multi-line author blocks are handled.

	        Returns:
	            A one-element list with an ERROR if the field does not look
	            anonymized and contains two adjacent capitalized words;
	            otherwise an empty list.
	        """
	        match = re.search(r'\\author\s*\{', content)
	        if not match:
	            return []

	        # Find the brace that closes the field.  If the braces never
	        # balance, use everything up to the end of the document.
	        start = match.end()
	        end = len(content)
	        depth = 1
	        for pos in range(start, len(content)):
	            char = content[pos]
	            if char == '{':
	                depth += 1
	            elif char == '}':
	                depth -= 1
	                if depth == 0:
	                    end = pos
	                    break
	        author_content = content[start:end]

	        # Explicit anonymous wording.
	        if re.search(r'anonymous|author\s*names?\s*hidden', author_content, re.IGNORECASE):
	            return []

	        # Anonymization macros such as \Anonymous or \blindauthor.
	        if re.search(r'\\(Anonymous|blindauthor)', author_content):
	            return []

	        # Heuristic for a real name: two adjacent capitalized words.
	        if not re.search(r'[A-Z][a-z]+\s+[A-Z][a-z]+', author_content):
	            return []

	        return [self._create_result(
	            passed=False,
	            severity=CheckSeverity.ERROR,
	            message="Author field may contain real names",
	            line_number=self._find_line_number(content, match.start()),
	            suggestion="Replace with 'Anonymous' or use anonymization command"
	        )]