File size: 8,184 Bytes
46df5f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""
AI artifacts checker.

Detects leftover text from AI writing assistants that should be removed
before submission, such as:
- Conversational responses ("Sure, here is...")
- Placeholder text
- Markdown formatting artifacts
- Common AI response patterns
"""
import re
from typing import List, Tuple

from .base import BaseChecker, CheckResult, CheckSeverity


class AIArtifactsChecker(BaseChecker):
    """Detect AI-generated text artifacts that should be removed.

    The checker scans LaTeX source line by line and reports three categories
    of findings:

    * conversational assistant phrases (reported as ``ERROR``, at most one
      finding per line),
    * placeholder text such as ``[insert ... here]`` or ``TODO:`` (reported
      as ``WARNING``, one finding per matching pattern),
    * Markdown formatting that leaked into the LaTeX source (reported as
      ``INFO``, one finding per matching pattern).

    Lines inside verbatim-like environments and commented-out lines are not
    inspected.
    """

    name = "ai_artifacts"
    display_name = "AI Artifacts"
    description = "Detect leftover AI assistant text and placeholders"

    # Conversational AI patterns (matched case-insensitively).
    # These are phrases that clearly indicate a dialogue between user and AI assistant.
    AI_CONVERSATION_PATTERNS = [
        # Responses to requests
        (r'\bsure[,!]?\s*(here\s+is|i\'ll|i\s+will|let\s+me)\b', "Conversational AI response"),
        (r'\bi\'?d\s+be\s+happy\s+to\b', "Conversational AI response"),
        (r'\bi\'?m\s+happy\s+to\s+help\b', "Conversational AI response"),
        (r'\bcertainly[!,]\s*here\b', "Conversational AI response"),
        (r'\bof\s+course[!,]\s*(here|i)\b', "Conversational AI response"),
        (r'\babsolutely[!,]\s*(here|let\s+me)\b', "Conversational AI response"),

        # Self-identification
        (r'\bas\s+an?\s+ai\s+(language\s+)?model\b', "AI self-reference"),
        (r'\bas\s+a\s+large\s+language\s+model\b', "AI self-reference"),
        (r'\bmy\s+knowledge\s+cutoff\b', "AI knowledge cutoff reference"),

        # Explanatory transitions typical of chat
        (r'\blet\s+me\s+(explain|help|clarify|break\s+this\s+down)\b', "Conversational AI response"),
        (r'\bhere\'?s\s+(a|an|the|my)\s+(revised|updated|improved|rewrite)\b', "Conversational AI response"),
        (r'\bhere\s+is\s+(the|a|an)\s+(summary|breakdown|explanation|code|example)\b', "Conversational AI response"),

        # Closing/Politeness
        (r'\bhope\s+this\s+helps\b', "Conversational AI closing"),
        (r'\bfeel\s+free\s+to\s+ask\b', "Conversational AI closing"),
        (r'\blet\s+me\s+know\s+if\b', "Conversational AI closing"),
        (r'\bthank\s+you\s+for\s+(asking|your\s+question)\b', "Conversational AI response"),
        (r'\bgreat\s+question[!,]?\b', "Conversational AI response"),
        (r'\b(excellent|good|great)\s+point\b', "Conversational AI response"),

        # Instructions/Meta-commentary
        (r'\bbased\s+on\s+the\s+information\s+provided\b', "Conversational AI response"),
        (r'\b(remember|note)\s+that\b', "Conversational AI instruction"),
        (r'\bplease\s+note\s+that\b', "Conversational AI instruction"),
    ]

    # Placeholder patterns (matched case-insensitively).
    PLACEHOLDER_PATTERNS = [
        (r'\[insert\s+[^\]]+\s*here\]', "Placeholder text"),
        (r'\[add\s+[^\]]+\]', "Placeholder text"),
        (r'\[todo[:\s][^\]]*\]', "TODO placeholder"),
        (r'\btodo\s*:\s*.{0,50}', "TODO comment"),
        (r'\bfixme\s*:\s*.{0,50}', "FIXME comment"),
        (r'\bxxx\b', "XXX placeholder"),
        (r'\byour[\s_-]*(name|email|institution|university)\b', "Placeholder for personal info"),
        (r'author[\s_-]*name', "Author name placeholder"),
        (r'your\.?email@example\.com', "Email placeholder"),
        (r'example@(example\.com|university\.edu)', "Email placeholder"),
        (r'\[citation\s+needed\]', "Citation needed placeholder"),
    ]

    # Markdown artifacts (should not appear in LaTeX); matched case-sensitively.
    MARKDOWN_PATTERNS = [
        (r'^\s*#{1,6}\s+\w', "Markdown header"),
        (r'\*\*[^*]+\*\*', "Markdown bold"),
        (r'(?<!\*)\*[^*\s][^*]*[^*\s]\*(?!\*)', "Markdown italic"),
        (r'(?<!`)`[^`\n]+`(?!`)', "Markdown inline code"),
        # Patterns are applied one line at a time, so a multi-line fenced block
        # can only be recognised by its opening/closing fence line; the second
        # alternative still catches a fence that opens and closes on one line.
        (r'^\s*```|```[\s\S]*?```', "Markdown code block"),
        (r'^\s*[-*+]\s+\w', "Markdown bullet point"),
        (r'^\s*\d+\.\s+\w', "Markdown numbered list"),
        (r'\[([^\]]+)\]\(([^)]+)\)', "Markdown link"),
    ]

    # Environments whose body is treated as literal text and never inspected.
    VERBATIM_ENVS = ('verbatim', 'lstlisting', 'minted', 'comment', 'raw', 'filecontents', 'tcolorbox')

    # Compiled once instead of being rebuilt for every line. Group 1 is the
    # environment name without the optional trailing star.
    _VERBATIM_BEGIN_RE = re.compile(r'\\begin\{(' + '|'.join(VERBATIM_ENVS) + r')\*?\}')
    _VERBATIM_END_RE = re.compile(r'\\end\{(' + '|'.join(VERBATIM_ENVS) + r')\*?\}')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan LaTeX source and return one result per detected artifact.

        Args:
            tex_content: Full LaTeX source; it is split on ``'\\n'`` and line
                numbers in the results are 1-based.
            config: Accepted for interface compatibility; not read here.

        Returns:
            A list of failed check results in line order. Within a line the
            order is: conversational pattern, placeholders, Markdown.
        """
        results = []
        # Name of the verbatim-like environment currently open, or None.
        open_env = None

        for line_num, line in enumerate(tex_content.split('\n'), 1):
            if open_env is not None:
                # Inside a verbatim body '%' is literal text, so scan the raw
                # line; only the matching \end{...} closes the environment.
                open_env = self._scan_verbatim(line, open_env)
                continue

            # Skip commented lines using base class method. This runs before
            # boundary detection so a commented-out \begin{...} is ignored.
            if self._is_comment_line(line):
                continue

            # Remove inline comments for checking using base class method.
            line_to_check = self._remove_line_comment(line)

            # Lines carrying an environment boundary are never inspected.
            if (self._VERBATIM_BEGIN_RE.search(line_to_check)
                    or self._VERBATIM_END_RE.search(line_to_check)):
                # A begin/end pair on one line leaves no environment open.
                open_env = self._scan_verbatim(line_to_check, None)
                continue

            snippet = line.strip()[:100]

            description = self._find_conversation(line_to_check)
            if description is not None:
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.ERROR,
                    message=f"{description} detected",
                    line_number=line_num,
                    line_content=snippet,
                    suggestion="Remove AI-generated conversational text"
                ))

            for description, matched in self._find_placeholders(line_to_check):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"{description}: '{matched[:50]}'",
                    line_number=line_num,
                    line_content=snippet,
                    suggestion="Replace placeholder with actual content or remove"
                ))

            # Markdown findings are less strict - might be intentional in some cases.
            for description in self._find_markdown(line_to_check):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Possible {description} in LaTeX",
                    line_number=line_num,
                    line_content=snippet,
                    suggestion="Convert to LaTeX formatting or remove if unintentional"
                ))

        return results

    @classmethod
    def _scan_verbatim(cls, text, open_env):
        """Return the verbatim environment still open after reading ``text``.

        ``open_env`` is the environment open before ``text`` (or ``None``).
        Boundaries are consumed left to right; an ``\\end{...}`` only closes
        the environment when its name matches the one that was opened.
        """
        pos = 0
        while True:
            if open_env is None:
                begin = cls._VERBATIM_BEGIN_RE.search(text, pos)
                if begin is None:
                    return None
                open_env, pos = begin.group(1), begin.end()
            else:
                end = next(
                    (m for m in cls._VERBATIM_END_RE.finditer(text, pos)
                     if m.group(1) == open_env),
                    None,
                )
                if end is None:
                    return open_env
                open_env, pos = None, end.end()

    def _find_conversation(self, text):
        """Return the description of the first conversational pattern in ``text``, or None."""
        for pattern, description in self.AI_CONVERSATION_PATTERNS:
            if re.search(pattern, text, re.IGNORECASE):
                return description  # One match per line for this category
        return None

    def _find_placeholders(self, text):
        """Return ``(description, matched_text)`` for every placeholder pattern found in ``text``."""
        found = []
        for pattern, description in self.PLACEHOLDER_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                found.append((description, match.group(0)))
        return found

    def _find_markdown(self, text):
        """Return the descriptions of all Markdown patterns found in ``text``."""
        # Skip if line looks like a LaTeX command (starts with \).
        if text.strip().startswith('\\'):
            return []

        # Simple heuristics, computed once per line: '$' suggests math mode,
        # and a sign followed by a digit suggests subtraction/negative numbers.
        has_math = '$' in text
        has_signed_number = re.search(r'[-+]\d', text) is not None

        found = []
        for pattern, description in self.MARKDOWN_PATTERNS:
            if "bullet point" in description and (has_signed_number or has_math):
                continue
            # Avoid matching math mode like $x*y$ as italics.
            if "italic" in description and has_math:
                continue
            if re.search(pattern, text):
                found.append(description)
        return found