|
|
""" |
|
|
Anonymization checker. |
|
|
|
|
|
For double-blind review submissions, checks for: |
|
|
- Author name leaks in acknowledgments |
|
|
- Personal URLs (GitHub, personal pages) |
|
|
- Self-citations that reveal identity |
|
|
- Institutional information in comments |
|
|
""" |
|
|
import re |
|
|
from typing import List |
|
|
|
|
|
from .base import BaseChecker, CheckResult, CheckSeverity |
|
|
|
|
|
|
|
|
class AnonymizationChecker(BaseChecker):
    """Check for anonymization issues in double-blind submissions.

    The checker scans LaTeX source for, in this order:

    1. personal / profile URLs (ERROR in live text, WARNING in comment lines),
    2. a live (non-commented) acknowledgments section,
    3. phrasing that suggests a self-citation,
    4. ``\\author{...}`` fields that look like they contain real names.

    If the document does not look like a review version, a single passing
    INFO result is returned and no further checks are run.
    """

    name = "anonymization"
    display_name = "Anonymization"
    description = "Detect potential identity leaks in double-blind submissions"

    # (regex, description) pairs; every regex is matched case-insensitively
    # against one source line at a time.
    PERSONAL_URL_PATTERNS = [
        (r'github\.com/[a-zA-Z0-9_-]+', "GitHub profile URL"),
        (r'gitlab\.com/[a-zA-Z0-9_-]+', "GitLab profile URL"),
        (r'twitter\.com/[a-zA-Z0-9_]+', "Twitter profile URL"),
        (r'linkedin\.com/in/[a-zA-Z0-9_-]+', "LinkedIn profile URL"),
        (r'huggingface\.co/[a-zA-Z0-9_-]+', "HuggingFace profile URL"),
        # The tilde must sit in a URL path ("/~user") or be written as
        # "\string~user". A bare "~word/" is not enough: "~" is LaTeX's
        # non-breaking space, so "each~input/output" would be a false hit.
        (r'(?:/|\\string)~[a-zA-Z][a-zA-Z0-9_-]*', "Personal university page"),
        (r'people\.[a-zA-Z]+\.edu', "Academic personal page"),
        (r'homes\.[a-zA-Z]+\.(edu|ac\.[a-z]+)', "Academic home page"),
    ]

    # Markers of an anonymized submission. Not referenced by the methods of
    # this class; kept for code elsewhere that may read it.
    ANONYMOUS_MARKERS = [
        r'\\author\{[^}]*anonymous[^}]*\}',
        r'anonymous\s+submission',
        r'\\runningauthor\{[^}]*\}',
    ]

    # Phrasings that hint the authors are citing themselves. The leading \b
    # on the pronoun patterns prevents matches inside other words, e.g.
    # "Lowe demonstrated" (-> "we demonstrated") or "your previous work".
    SELF_CITE_PATTERNS = [
        r'\\cite[pt]?\{[^}]*\}\s*(?:show|demonstrate|propose|present|introduce)',
        r'\b(?:our|we)\s+(?:previous|prior|earlier)\s+(?:work|paper|study)',
        r'\b(?:as\s+)?(?:we|the\s+authors?)\s+(?:have\s+)?(?:shown|demonstrated|proved)',
    ]

    # Start of an acknowledgments section: \section{Acknowledg(e)ment...}
    # (starred or not, both spellings) or any environment starting with "ack".
    ACK_PATTERN = re.compile(
        r'\\(?:section\*?\{acknowledge?ment|begin\{ack)',
        re.IGNORECASE
    )

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all anonymization checks on a LaTeX document.

        Args:
            tex_content: Full LaTeX source of the document.
            config: Accepted for interface compatibility; not read here.

        Returns:
            A list of check results. For a document judged to be camera-ready
            the list holds a single passing INFO result. Otherwise it holds
            URL findings, acknowledgment findings, self-citation findings and
            author-field findings, in that order.
        """
        results = []
        lines = tex_content.split('\n')

        if not self._is_review_version(tex_content):
            results.append(self._create_result(
                passed=True,
                severity=CheckSeverity.INFO,
                message="Document appears to be camera-ready version (not checking anonymization)"
            ))
            return results

        results.extend(self._check_personal_urls(lines))
        results.extend(self._check_acknowledgments(tex_content, lines))
        results.extend(self._check_self_citations(lines))
        results.extend(self._check_author_field(tex_content))
        return results

    def _check_personal_urls(self, lines: List[str]) -> List[CheckResult]:
        """Flag personal/profile URLs; WARNING in comment lines, ERROR elsewhere."""
        results = []
        # Compile once instead of re-resolving each pattern for every line.
        url_patterns = [
            (re.compile(pattern, re.IGNORECASE), desc)
            for pattern, desc in self.PERSONAL_URL_PATTERNS
        ]

        for line_num, line in enumerate(lines, 1):
            if self._is_comment_line(line):
                severity = CheckSeverity.WARNING
                detail = "in comment (could be revealed when compiling)"
                suggestion = "Remove or anonymize URL even in comments"
            else:
                severity = CheckSeverity.ERROR
                detail = "may reveal author identity"
                suggestion = "Replace with anonymized URL or remove for review"

            # One result per matching pattern, so a line with several kinds
            # of URL is reported once per kind.
            for pattern, desc in url_patterns:
                if pattern.search(line):
                    results.append(self._create_result(
                        passed=False,
                        severity=severity,
                        message=f"{desc} {detail}",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion=suggestion
                    ))

        return results

    def _check_self_citations(self, lines: List[str]) -> List[CheckResult]:
        """Flag non-comment lines whose wording suggests a self-citation."""
        results = []
        cite_patterns = [
            re.compile(pattern, re.IGNORECASE)
            for pattern in self.SELF_CITE_PATTERNS
        ]

        for line_num, line in enumerate(lines, 1):
            if self._is_comment_line(line):
                continue

            for pattern in cite_patterns:
                if pattern.search(line):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message="Potentially self-revealing citation pattern",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
                    ))

        return results

    def _is_review_version(self, content: str) -> bool:
        """Detect if this is a review (anonymous) version.

        Only the first 2000 characters are inspected, with LaTeX comments
        removed first. Conference templates ship explanatory comments such as
        "% ... submitted for review" or "% ... camera-ready ..." in both the
        review and the final configuration, so comment text says nothing
        about which mode is actually active.

        Returns:
            True if a review indicator is present, or if neither a review nor
            a camera-ready indicator is found (checking is the safe default).
            False only when a camera-ready indicator is present and no review
            indicator is.
        """
        # Drop everything from an unescaped '%' to the end of each line;
        # '\%' is a literal percent sign and is kept.
        head = re.sub(r'(?<!\\)%.*', '', content[:2000])

        review_indicators = [
            r'review',
            r'submitted\s+to',
            r'under\s+review',
            r'anonymous',
            r'\\usepackage\[review\]',
        ]
        if any(re.search(indicator, head, re.IGNORECASE)
               for indicator in review_indicators):
            return True

        camera_indicators = [
            r'\\usepackage\[accepted\]',
            r'\\usepackage\[final\]',
            r'camera[\s-]?ready',
        ]
        if any(re.search(indicator, head, re.IGNORECASE)
               for indicator in camera_indicators):
            return False

        return True

    def _check_acknowledgments(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check acknowledgments section for identity leaks.

        Every acknowledgments heading/environment in ``content`` is examined,
        so a commented-out heading earlier in the file does not hide a live
        one further down. At most one WARNING is produced, for the first
        occurrence whose line does not start with ``%``.
        """
        results = []

        for ack_match in self.ACK_PATTERN.finditer(content):
            ack_line = self._find_line_number(content, ack_match.start())

            # Guard both ends so an unexpected line number cannot index the
            # wrong line (0 would otherwise wrap around to the last line).
            actual_line = lines[ack_line - 1] if 1 <= ack_line <= len(lines) else ""
            if actual_line.lstrip().startswith('%'):
                continue

            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Acknowledgments section found - should be commented out for review",
                line_number=ack_line,
                suggestion="Comment out acknowledgments with % for anonymous submission"
            ))
            break

        return results

    def _check_author_field(self, content: str) -> List[CheckResult]:
        """Check \\author{} field for revealing content.

        Every ``\\author{...}`` occurrence is inspected, not just the first,
        so an anonymized occurrence cannot mask a later one with real names.
        An occurrence yields an ERROR when its content contains neither an
        anonymity marker nor an anonymization command, and contains two
        consecutive capitalized words (a "First Last" shape).
        """
        results = []
        author_pattern = re.compile(r'\\author\s*\{')

        for match in author_pattern.finditer(content):
            # Walk forward to the brace that closes \author{, honouring
            # nested groups such as \author{A \thanks{...}}.
            start = match.end()
            depth = 1
            pos = start
            while pos < len(content) and depth > 0:
                char = content[pos]
                if char == '{':
                    depth += 1
                elif char == '}':
                    depth -= 1
                pos += 1

            # With a balanced group, pos is one past the closing brace. If
            # the group never closes, keep everything up to the end of the
            # input rather than dropping its last character.
            end = pos - 1 if depth == 0 else pos
            author_content = content[start:end]

            if re.search(r'anonymous|author\s*names?\s*hidden', author_content, re.IGNORECASE):
                continue
            if re.search(r'\\(Anonymous|blindauthor)', author_content):
                continue

            if re.search(r'[A-Z][a-z]+\s+[A-Z][a-z]+', author_content):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.ERROR,
                    message="Author field may contain real names",
                    line_number=self._find_line_number(content, match.start()),
                    suggestion="Replace with 'Anonymous' or use anonymization command"
                ))

        return results
|
|
|