File size: 8,823 Bytes
46df5f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
"""
Anonymization checker.

For double-blind review submissions, checks for:
- Acknowledgments sections that are not commented out
- Personal URLs (GitHub, GitLab, social profiles, personal academic pages),
  including URLs left in LaTeX comments
- Self-citation phrasing that reveals identity
- Real-looking names in the author field
"""
import re
from typing import List

from .base import BaseChecker, CheckResult, CheckSeverity


class AnonymizationChecker(BaseChecker):
    """Check for anonymization issues in double-blind submissions.

    The checker first decides whether the document looks like a review
    (anonymous) version. If it does, it reports, in this order: personal
    URLs, acknowledgments sections that are not commented out, phrasing
    that hints at self-citation, and ``\\author`` fields that appear to
    contain real names. If the document looks camera-ready, a single
    informational result is returned and nothing else is checked.
    """

    name = "anonymization"
    display_name = "Anonymization"
    description = "Detect potential identity leaks in double-blind submissions"

    # (regex, human-readable description) pairs for identity-revealing URLs.
    # All of them are matched case-insensitively, one line at a time.
    PERSONAL_URL_PATTERNS = [
        (r'github\.com/[a-zA-Z0-9_-]+', "GitHub profile URL"),
        (r'gitlab\.com/[a-zA-Z0-9_-]+', "GitLab profile URL"),
        (r'twitter\.com/[a-zA-Z0-9_]+', "Twitter profile URL"),
        (r'linkedin\.com/in/[a-zA-Z0-9_-]+', "LinkedIn profile URL"),
        (r'huggingface\.co/[a-zA-Z0-9_-]+', "HuggingFace profile URL"),
        # The leading "/" is required so that an ordinary LaTeX tie such as
        # "an~input/output" is not mistaken for a "/~user" home directory.
        (r'/~[a-zA-Z][a-zA-Z0-9_.-]*', "Personal university page"),
        (r'people\.[a-zA-Z]+\.edu', "Academic personal page"),
        (r'homes\.[a-zA-Z]+\.(edu|ac\.[a-z]+)', "Academic home page"),
    ]

    # Anonymous submission indicators (should be present).
    # NOTE: not referenced by any method of this class; kept as-is in case
    # other code reads it.
    ANONYMOUS_MARKERS = [
        r'\\author\{[^}]*anonymous[^}]*\}',
        r'anonymous\s+submission',
        r'\\runningauthor\{[^}]*\}',  # Should be empty or generic
    ]

    # Phrasing that tends to reveal that a cited work is the authors' own.
    SELF_CITE_PATTERNS = [
        r'\\cite[pt]?\{[^}]*\}\s*(?:show|demonstrate|propose|present|introduce)',
        r'(?:our|we)\s+(?:previous|prior|earlier)\s+(?:work|paper|study)',
        r'(?:as\s+)?(?:we|the\s+authors?)\s+(?:have\s+)?(?:shown|demonstrated|proved)',
    ]

    # Start of an acknowledgments section. "acknowledge?ment" accepts both
    # the "Acknowledgments" and "Acknowledgements" spellings.
    ACK_PATTERN = re.compile(
        r'\\(?:section\*?\{acknowledge?ment|begin\{ack)',
        re.IGNORECASE
    )

    # An unescaped "%" and everything after it on the same line.
    _LATEX_COMMENT = re.compile(r'(?<!\\)%.*')

    # "\author{" with an optional "[short form]" argument before the brace.
    _AUTHOR_COMMAND = re.compile(r'\\author\s*(?:\[[^\]]*\]\s*)?\{')

    # Number of leading characters inspected when detecting the version.
    _PREAMBLE_WINDOW = 2000

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run every anonymization check on a LaTeX document.

        Args:
            tex_content: Full LaTeX source of the document.
            config: Accepted for interface compatibility; not read here.

        Returns:
            A list of check results. For a document that does not look like
            a review version this is a single passing INFO result.
        """
        results = []

        if not self._is_review_version(tex_content):
            # Camera-ready documents are expected to be de-anonymized.
            results.append(self._create_result(
                passed=True,
                severity=CheckSeverity.INFO,
                message="Document appears to be camera-ready version (not checking anonymization)"
            ))
            return results

        lines = tex_content.split('\n')

        results.extend(self._check_personal_urls(lines))
        results.extend(self._check_acknowledgments(tex_content, lines))
        results.extend(self._check_self_citations(lines))
        results.extend(self._check_author_field(tex_content))

        return results

    def _check_personal_urls(self, lines: List[str]) -> List[CheckResult]:
        """Report personal URLs, line by line.

        A URL on a comment line is reported as a WARNING; a URL on any other
        line is reported as an ERROR.
        """
        results = []
        # Compile once per call instead of resolving each regex on every line.
        url_patterns = [
            (re.compile(pattern, re.IGNORECASE), desc)
            for pattern, desc in self.PERSONAL_URL_PATTERNS
        ]

        for line_num, line in enumerate(lines, 1):
            in_comment = self._is_comment_line(line)
            for pattern, desc in url_patterns:
                if not pattern.search(line):
                    continue
                if in_comment:
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=f"{desc} in comment (could be revealed when compiling)",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Remove or anonymize URL even in comments"
                    ))
                else:
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{desc} may reveal author identity",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Replace with anonymized URL or remove for review"
                    ))

        return results

    def _check_self_citations(self, lines: List[str]) -> List[CheckResult]:
        """Report self-revealing citation phrasing on non-comment lines."""
        results = []
        cite_patterns = [
            re.compile(pattern, re.IGNORECASE)
            for pattern in self.SELF_CITE_PATTERNS
        ]

        for line_num, line in enumerate(lines, 1):
            if self._is_comment_line(line):
                continue

            # One result per matching pattern.
            for pattern in cite_patterns:
                if pattern.search(line):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message="Potentially self-revealing citation pattern",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
                    ))

        return results

    def _is_review_version(self, content: str) -> bool:
        """Detect if this is a review (anonymous) version.

        Only the first ``_PREAMBLE_WINDOW`` characters are inspected, and
        LaTeX comments inside that window are ignored. Conference templates
        commonly ship commented-out alternatives such as
        ``% \\usepackage[final]{...}``; matching those would classify a real
        submission as camera-ready and skip every check.

        Review indicators are tested before camera-ready indicators, and the
        method returns True when neither kind is found, so the checks run
        whenever the version is unclear.
        """
        head = self._LATEX_COMMENT.sub('', content[:self._PREAMBLE_WINDOW])

        review_indicators = [
            r'review',
            r'submitted\s+to',
            r'under\s+review',
            r'anonymous',
            r'\\usepackage\[review\]',
        ]
        if any(re.search(p, head, re.IGNORECASE) for p in review_indicators):
            return True

        camera_indicators = [
            r'\\usepackage\[accepted\]',
            r'\\usepackage\[final\]',
            r'camera[\s-]?ready',
        ]
        if any(re.search(p, head, re.IGNORECASE) for p in camera_indicators):
            return False

        # Default to review version (safer)
        return True

    def _check_acknowledgments(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Report acknowledgments sections that are not commented out.

        Every occurrence is examined, so a commented-out heading does not
        hide a later live one. An occurrence counts as commented out when an
        unescaped ``%`` precedes it on the same line.

        Args:
            content: Full LaTeX source.
            lines: ``content`` split into lines. Kept for interface
                compatibility; the line text is taken from ``content``.

        Returns:
            One WARNING result per live acknowledgments section.
        """
        results = []

        for ack_match in self.ACK_PATTERN.finditer(content):
            # Text between the start of the line and the match.
            line_start = content.rfind('\n', 0, ack_match.start()) + 1
            prefix = content[line_start:ack_match.start()]
            if self._LATEX_COMMENT.search(prefix):
                continue

            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Acknowledgments section found - should be commented out for review",
                line_number=self._find_line_number(content, ack_match.start()),
                suggestion="Comment out acknowledgments with % for anonymous submission"
            ))

        return results

    def _check_author_field(self, content: str) -> List[CheckResult]:
        """Check \\author{} fields for revealing content.

        Every ``\\author`` command is inspected, including the
        ``\\author[short]{...}`` form. A field is reported when it contains
        neither an anonymity marker nor an anonymization command, and it
        holds two consecutive capitalized words.

        Returns:
            One ERROR result per suspicious author field.
        """
        results = []

        for match in self._AUTHOR_COMMAND.finditer(content):
            author_content = self._extract_braced(content, match.end())

            # Explicitly anonymous text, e.g. "Anonymous Authors".
            if re.search(r'anonymous|author\s*names?\s*hidden', author_content, re.IGNORECASE):
                continue
            # Anonymization commands such as \Anonymous or \blindauthor.
            if re.search(r'\\(Anonymous|blindauthor)', author_content):
                continue
            # Two capitalized words in a row are taken as a possible name.
            if not re.search(r'[A-Z][a-z]+\s+[A-Z][a-z]+', author_content):
                continue

            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.ERROR,
                message="Author field may contain real names",
                line_number=self._find_line_number(content, match.start()),
                suggestion="Replace with 'Anonymous' or use anonymization command"
            ))

        return results

    @staticmethod
    def _extract_braced(content: str, start: int) -> str:
        """Return the text of a brace group whose ``{`` ends just before ``start``.

        Nested groups are followed. A backslash and the character after it
        are skipped, so ``\\{``, ``\\}`` and ``\\\\`` do not affect the
        nesting depth. If the group is never closed, everything from
        ``start`` to the end of ``content`` is returned.
        """
        depth = 1
        pos = start
        end = len(content)
        while pos < end:
            char = content[pos]
            if char == '\\':
                pos += 2
                continue
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    return content[start:pos]
            pos += 1
        return content[start:]