File size: 8,184 Bytes
46df5f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
"""
AI artifacts checker.
Detects leftover text from AI writing assistants that should be removed
before submission, such as:
- Conversational responses ("Sure, here is...")
- Placeholder text
- Markdown formatting artifacts
- Common AI response patterns
"""
import re
from typing import List, Tuple
from .base import BaseChecker, CheckResult, CheckSeverity
class AIArtifactsChecker(BaseChecker):
    """Detect AI-generated text artifacts that should be removed.

    The checker scans LaTeX source line by line and reports three
    categories of findings, each with its own severity:

    - conversational AI phrases (``CheckSeverity.ERROR``),
    - placeholder text (``CheckSeverity.WARNING``),
    - Markdown formatting left in LaTeX (``CheckSeverity.INFO``).

    Every pattern table below is a list of ``(regex, description)`` tuples.
    """

    name = "ai_artifacts"
    display_name = "AI Artifacts"
    description = "Detect leftover AI assistant text and placeholders"

    # Conversational AI patterns (matched case-insensitively).
    # These are phrases that clearly indicate a dialogue between user and AI assistant.
    AI_CONVERSATION_PATTERNS = [
        # Responses to requests
        (r'\bsure[,!]?\s*(here\s+is|i\'ll|i\s+will|let\s+me)\b', "Conversational AI response"),
        (r'\bi\'?d\s+be\s+happy\s+to\b', "Conversational AI response"),
        (r'\bi\'?m\s+happy\s+to\s+help\b', "Conversational AI response"),
        (r'\bcertainly[!,]\s*here\b', "Conversational AI response"),
        (r'\bof\s+course[!,]\s*(here|i)\b', "Conversational AI response"),
        (r'\babsolutely[!,]\s*(here|let\s+me)\b', "Conversational AI response"),
        # Self-identification
        (r'\bas\s+an?\s+ai\s+(language\s+)?model\b', "AI self-reference"),
        (r'\bas\s+a\s+large\s+language\s+model\b', "AI self-reference"),
        (r'\bmy\s+knowledge\s+cutoff\b', "AI knowledge cutoff reference"),
        # Explanatory transitions typical of chat
        (r'\blet\s+me\s+(explain|help|clarify|break\s+this\s+down)\b', "Conversational AI response"),
        (r'\bhere\'?s\s+(a|an|the|my)\s+(revised|updated|improved|rewrite)\b', "Conversational AI response"),
        (r'\bhere\s+is\s+(the|a|an)\s+(summary|breakdown|explanation|code|example)\b', "Conversational AI response"),
        # Closing/Politeness
        (r'\bhope\s+this\s+helps\b', "Conversational AI closing"),
        (r'\bfeel\s+free\s+to\s+ask\b', "Conversational AI closing"),
        (r'\blet\s+me\s+know\s+if\b', "Conversational AI closing"),
        (r'\bthank\s+you\s+for\s+(asking|your\s+question)\b', "Conversational AI response"),
        (r'\bgreat\s+question[!,]?\b', "Conversational AI response"),
        (r'\b(excellent|good|great)\s+point\b', "Conversational AI response"),
        # Instructions/Meta-commentary
        (r'\bbased\s+on\s+the\s+information\s+provided\b', "Conversational AI response"),
        (r'\b(remember|note)\s+that\b', "Conversational AI instruction"),
        # NOTE: any text matching the next pattern is already caught by the
        # previous one (same description), so this entry never changes the
        # outcome; it is kept for explicitness.
        (r'\bplease\s+note\s+that\b', "Conversational AI instruction"),
    ]

    # Placeholder patterns (matched case-insensitively).
    PLACEHOLDER_PATTERNS = [
        (r'\[insert\s+[^\]]+\s*here\]', "Placeholder text"),
        (r'\[add\s+[^\]]+\]', "Placeholder text"),
        (r'\[todo[:\s][^\]]*\]', "TODO placeholder"),
        (r'\btodo\s*:\s*.{0,50}', "TODO comment"),
        (r'\bfixme\s*:\s*.{0,50}', "FIXME comment"),
        (r'\bxxx\b', "XXX placeholder"),
        (r'\byour[\s_-]*(name|email|institution|university)\b', "Placeholder for personal info"),
        (r'author[\s_-]*name', "Author name placeholder"),
        (r'your\.?email@example\.com', "Email placeholder"),
        (r'example@(example\.com|university\.edu)', "Email placeholder"),
        (r'\[citation\s+needed\]', "Citation needed placeholder"),
    ]

    # Markdown artifacts (should not appear in LaTeX); matched case-sensitively.
    MARKDOWN_PATTERNS = [
        (r'^\s*#{1,6}\s+\w', "Markdown header"),
        (r'\*\*[^*]+\*\*', "Markdown bold"),
        (r'(?<!\*)\*[^*\s][^*]*[^*\s]\*(?!\*)', "Markdown italic"),
        (r'(?<!`)`[^`\n]+`(?!`)', "Markdown inline code"),
        (r'```[\s\S]*?```', "Markdown code block"),
        (r'^\s*[-*+]\s+\w', "Markdown bullet point"),
        (r'^\s*\d+\.\s+\w', "Markdown numbered list"),
        (r'\[([^\]]+)\]\(([^)]+)\)', "Markdown link"),
    ]

    # Environments whose content is skipped entirely (starred variants included).
    _VERBATIM_ENVS = ('verbatim', 'lstlisting', 'minted', 'comment', 'raw', 'filecontents', 'tcolorbox')
    # Compiled once here instead of being rebuilt for every input line.
    _VERBATIM_BEGIN_RE = re.compile(r'\\begin\{(' + '|'.join(_VERBATIM_ENVS) + r')\*?\}')
    _VERBATIM_END_RE = re.compile(r'\\end\{(' + '|'.join(_VERBATIM_ENVS) + r')\*?\}')
    # A sign directly followed by a digit: treated as math, not as a bullet.
    _SIGNED_NUMBER_RE = re.compile(r'[-+]\d')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan LaTeX source for AI artifacts, placeholders and Markdown.

        Lines inside verbatim-like environments, the ``\\begin``/``\\end``
        lines of those environments, and comment lines are skipped. Inline
        comments are stripped before matching (both comment helpers are
        inherited from the base class).

        Args:
            tex_content: Full LaTeX source text; split on ``'\\n'``.
            config: Accepted for interface compatibility; not read here.

        Returns:
            One result per finding, built with ``self._create_result``:
            at most one ERROR per line for conversational patterns, one
            WARNING per matching placeholder pattern, and one INFO per
            matching Markdown pattern. Line numbers are 1-based.
        """
        results = []

        # Compile from the class attributes on each call so that subclasses
        # overriding the pattern tables are still honoured.
        conversation_regexes = [
            (re.compile(pattern, re.IGNORECASE), label)
            for pattern, label in self.AI_CONVERSATION_PATTERNS
        ]
        placeholder_regexes = [
            (re.compile(pattern, re.IGNORECASE), label)
            for pattern, label in self.PLACEHOLDER_PATTERNS
        ]
        markdown_regexes = [
            (re.compile(pattern), label)
            for pattern, label in self.MARKDOWN_PATTERNS
        ]

        in_verbatim = False

        for line_num, line in enumerate(tex_content.split('\n'), 1):
            # Environment boundaries. The boundary lines themselves are never checked.
            begin_match = self._VERBATIM_BEGIN_RE.search(line)
            if begin_match:
                # If the environment is closed again later on the same line
                # (e.g. "\begin{comment} x \end{comment}"), do not stay in
                # verbatim mode; otherwise the rest of the document would be
                # skipped.
                closes_on_same_line = self._VERBATIM_END_RE.search(line, begin_match.end())
                in_verbatim = closes_on_same_line is None
                continue
            if self._VERBATIM_END_RE.search(line):
                in_verbatim = False
                continue

            if in_verbatim:
                continue

            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments for checking using base class method
            line_to_check = self._remove_line_comment(line)
            # Results report the original line (trimmed), not the comment-stripped one.
            snippet = line.strip()[:100]

            # AI conversation patterns: report only the first match per line.
            for regex, label in conversation_regexes:
                if regex.search(line_to_check):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{label} detected",
                        line_number=line_num,
                        line_content=snippet,
                        suggestion="Remove AI-generated conversational text"
                    ))
                    break

            # Placeholder patterns: every matching pattern is reported.
            for regex, label in placeholder_regexes:
                match = regex.search(line_to_check)
                if match:
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=f"{label}: '{match.group(0)[:50]}'",
                        line_number=line_num,
                        line_content=snippet,
                        suggestion="Replace placeholder with actual content or remove"
                    ))

            # Markdown patterns (less strict - might be intentional in some cases).
            # Lines that start with a LaTeX command are exempt from all of them.
            if line_to_check.strip().startswith('\\'):
                continue

            # Simple math-mode heuristics, evaluated once per line.
            has_math = '$' in line_to_check
            has_signed_number = self._SIGNED_NUMBER_RE.search(line_to_check) is not None

            for regex, label in markdown_regexes:
                # Bullets: skip lines that look like subtraction/negative numbers or math.
                if "bullet point" in label and (has_signed_number or has_math):
                    continue
                # Italics: avoid matching math mode like $x*y$.
                if "italic" in label and has_math:
                    continue
                if regex.search(line_to_check):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Possible {label} in LaTeX",
                        line_number=line_num,
                        line_content=snippet,
                        suggestion="Convert to LaTeX formatting or remove if unintentional"
                    ))

        return results
|