File size: 8,823 Bytes
46df5f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 |
"""
Anonymization checker.
For double-blind review submissions, checks for:
- Author name leaks in acknowledgments
- Personal URLs (GitHub, personal pages)
- Self-citations that reveal identity
- Institutional information in comments
"""
import re
from typing import List, Optional

from .base import BaseChecker, CheckResult, CheckSeverity
class AnonymizationChecker(BaseChecker):
    """Check for anonymization issues in double-blind submissions.

    The checker scans LaTeX source for personal URLs, a live acknowledgments
    section, self-revealing citation phrasing and real-looking names in the
    author field. All checks are skipped when the document looks like a
    camera-ready version.

    Result objects are built with ``self._create_result`` and comment lines
    are recognised with ``self._is_comment_line``; both, together with
    ``self._find_line_number``, are expected to come from ``BaseChecker``
    (not visible in this file).
    """

    name = "anonymization"
    display_name = "Anonymization"
    description = "Detect potential identity leaks in double-blind submissions"

    # (regex, human-readable label) pairs for URLs that tend to identify a person.
    PERSONAL_URL_PATTERNS = [
        (r'github\.com/[a-zA-Z0-9_-]+', "GitHub profile URL"),
        (r'gitlab\.com/[a-zA-Z0-9_-]+', "GitLab profile URL"),
        (r'twitter\.com/[a-zA-Z0-9_]+', "Twitter profile URL"),
        (r'linkedin\.com/in/[a-zA-Z0-9_-]+', "LinkedIn profile URL"),
        (r'huggingface\.co/[a-zA-Z0-9_-]+', "HuggingFace profile URL"),
        (r'~[a-zA-Z]+/', "Personal university page"),
        (r'people\.[a-zA-Z]+\.edu', "Academic personal page"),
        (r'homes\.[a-zA-Z]+\.(edu|ac\.[a-z]+)', "Academic home page"),
    ]

    # Anonymous submission indicators (should be present).
    # NOTE(review): no method of this class reads this list; it is kept so
    # that any external user of the attribute keeps working.
    ANONYMOUS_MARKERS = [
        r'\\author\{[^}]*anonymous[^}]*\}',
        r'anonymous\s+submission',
        r'\\runningauthor\{[^}]*\}',  # Should be empty or generic
    ]

    # Phrasings that tend to tie the submission to the authors' earlier work.
    SELF_CITE_PATTERNS = [
        r'\\cite[pt]?\{[^}]*\}\s*(?:show|demonstrate|propose|present|introduce)',
        r'(?:our|we)\s+(?:previous|prior|earlier)\s+(?:work|paper|study)',
        r'(?:as\s+)?(?:we|the\s+authors?)\s+(?:have\s+)?(?:shown|demonstrated|proved)',
    ]

    # Start of an acknowledgments section. The optional "e" accepts both the
    # "Acknowledgments" and "Acknowledgements" spellings.
    ACK_PATTERN = re.compile(
        r'\\(?:section\*?\{acknowledge?ment|begin\{ack)',
        re.IGNORECASE
    )

    # A '%' that is not escaped as '\%' starts a LaTeX comment.
    _COMMENT_START = re.compile(r'(?<!\\)%')

    def check(self, tex_content: str, config: Optional[dict] = None) -> List[CheckResult]:
        """Run every anonymization check on a LaTeX document.

        Args:
            tex_content: Full LaTeX source of the document.
            config: Accepted for interface compatibility; not read here.

        Returns:
            A list of check results. For a document that looks camera-ready
            the list holds a single passing INFO result. Otherwise it holds
            URL findings, acknowledgment findings, self-citation findings and
            author-field findings, in that order (possibly empty).
        """
        if not self._is_review_version(tex_content):
            # If camera-ready, skip anonymization checks
            return [self._create_result(
                passed=True,
                severity=CheckSeverity.INFO,
                message="Document appears to be camera-ready version (not checking anonymization)"
            )]

        lines = tex_content.split('\n')
        results = []
        results.extend(self._check_personal_urls(lines))
        results.extend(self._check_acknowledgments(tex_content, lines))
        results.extend(self._check_self_citations(lines))
        results.extend(self._check_author_field(tex_content))
        return results

    def _check_personal_urls(self, lines: List[str]) -> List[CheckResult]:
        """Flag personal URLs: ERROR in live text, WARNING inside comment lines."""
        results = []
        for line_num, line in enumerate(lines, 1):
            in_comment = self._is_comment_line(line)
            for pattern, desc in self.PERSONAL_URL_PATTERNS:
                if not re.search(pattern, line, re.IGNORECASE):
                    continue
                if in_comment:
                    # Comments are not typeset, but the source itself may be shared.
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=f"{desc} in comment (could be revealed when compiling)",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Remove or anonymize URL even in comments"
                    ))
                else:
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{desc} may reveal author identity",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Replace with anonymized URL or remove for review"
                    ))
        return results

    def _check_self_citations(self, lines: List[str]) -> List[CheckResult]:
        """Flag phrasing in non-comment lines that hints at the authors' own prior work."""
        results = []
        for line_num, line in enumerate(lines, 1):
            if self._is_comment_line(line):
                continue
            for pattern in self.SELF_CITE_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message="Potentially self-revealing citation pattern",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
                    ))
        return results

    def _is_review_version(self, content: str) -> bool:
        """Detect if this is a review (anonymous) version.

        Only the first 2000 characters are inspected, and commented-out text
        is ignored so that template instructions such as "use this line for
        the camera-ready version" do not count as indicators.

        Precedence, strongest first:
          1. an explicit ``[review]`` package option -> review version
          2. an explicit ``[accepted]`` / ``[final]`` package option -> camera-ready
          3. loose review wording -> review version
          4. loose "camera-ready" wording -> camera-ready
          5. nothing found -> review version (the safer default)
        """
        # Strip everything from an unescaped '%' to the end of each line.
        head = re.sub(r'(?<!\\)%.*', '', content[:2000])

        def found(patterns):
            return any(re.search(p, head, re.IGNORECASE) for p in patterns)

        if found([r'\\usepackage\[review\]']):
            return True

        # Explicit package options outrank incidental words such as
        # "review" appearing in a title or abstract.
        if found([r'\\usepackage\[accepted\]', r'\\usepackage\[final\]']):
            return False

        if found([r'review', r'submitted\s+to', r'under\s+review', r'anonymous']):
            return True

        if found([r'camera[\s-]?ready']):
            return False

        # Default to review version (safer)
        return True

    def _check_acknowledgments(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check acknowledgments section for identity leaks.

        Every acknowledgments heading that is not commented out yields one
        WARNING. Commented-out headings are skipped, and the scan continues
        past them so a later live section is still reported.

        Args:
            content: Full LaTeX source.
            lines: ``content`` split into lines. Kept for interface
                compatibility; comment detection works on ``content`` directly.

        Returns:
            A list of WARNING results, empty when no live section exists.
        """
        results = []
        for ack_match in self.ACK_PATTERN.finditer(content):
            if self._in_comment(content, ack_match.start()):
                continue
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Acknowledgments section found - should be commented out for review",
                line_number=self._find_line_number(content, ack_match.start()),
                suggestion="Comment out acknowledgments with % for anonymous submission"
            ))
        return results

    def _check_author_field(self, content: str) -> List[CheckResult]:
        r"""Check \author{} fields for revealing content.

        Each ``\author{...}`` occurrence is examined. A field that mentions
        "anonymous" / "author names hidden", or uses ``\Anonymous`` or
        ``\blindauthor``, is accepted. Otherwise, if it contains two
        consecutive capitalised words, it is reported: ERROR for a live
        field, WARNING for a commented-out one (same split as for URLs).

        Args:
            content: Full LaTeX source.

        Returns:
            A list of results, empty when no suspicious author field exists.
        """
        results = []
        for match in re.finditer(r'\\author\s*\{', content):
            # Extract author content (handles nested braces and multiple lines)
            author_content = self._extract_braced(content, match.end())

            if re.search(r'anonymous|author\s*names?\s*hidden', author_content, re.IGNORECASE):
                continue
            if re.search(r'\\(Anonymous|blindauthor)', author_content):
                continue
            # Two capitalised words in a row look like "Firstname Lastname".
            if not re.search(r'[A-Z][a-z]+\s+[A-Z][a-z]+', author_content):
                continue

            line_num = self._find_line_number(content, match.start())
            if self._in_comment(content, match.start()):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="Commented-out author field may contain real names",
                    line_number=line_num,
                    suggestion="Remove author details from comments before sharing the source"
                ))
            else:
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.ERROR,
                    message="Author field may contain real names",
                    line_number=line_num,
                    suggestion="Replace with 'Anonymous' or use anonymization command"
                ))
        return results

    @classmethod
    def _in_comment(cls, content: str, pos: int) -> bool:
        """Return True if ``pos`` lies after an unescaped '%' on its own line."""
        line_start = content.rfind('\n', 0, pos) + 1
        return cls._COMMENT_START.search(content[line_start:pos]) is not None

    @staticmethod
    def _extract_braced(content: str, start: int) -> str:
        """Return the text of a brace group whose opening brace ends just before ``start``.

        Nested braces are balanced and backslash-escaped characters are
        skipped. If the group is never closed, the rest of ``content`` is
        returned unchanged.
        """
        depth = 1
        i = start
        end = len(content)
        while i < end:
            ch = content[i]
            if ch == '\\':
                # Skip the escaped character so that \{ and \} are not counted.
                i += 2
                continue
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    return content[start:i]
            i += 1
        return content[start:]
|