|
|
""" |
|
|
AI artifacts checker. |
|
|
|
|
|
Detects leftover text from AI writing assistants that should be removed |
|
|
before submission, such as: |
|
|
- Conversational responses ("Sure, here is...") |
|
|
- Placeholder text |
|
|
- Markdown formatting artifacts |
|
|
- Common AI response patterns |
|
|
""" |
|
|
import re |
|
|
from typing import List, Tuple |
|
|
|
|
|
from .base import BaseChecker, CheckResult, CheckSeverity |
|
|
|
|
|
|
|
|
class AIArtifactsChecker(BaseChecker):
    """Detect AI-generated text artifacts that should be removed.

    The checker scans LaTeX source line by line and reports three families
    of findings:

    * conversational / self-referential AI phrases (reported as ERROR),
    * placeholder text such as ``[insert ... here]`` or ``TODO:`` (WARNING),
    * Markdown formatting that leaked into the LaTeX source (INFO).

    Lines inside verbatim-like environments and comment lines are ignored.
    """

    name = "ai_artifacts"
    display_name = "AI Artifacts"
    description = "Detect leftover AI assistant text and placeholders"

    # Every table below holds (regex, human-readable description) pairs.
    # The regexes in the first two tables are applied with re.IGNORECASE.

    # Only the FIRST matching entry is reported per line, so order matters:
    # e.g. "please note that" is already caught by the "(remember|note) that"
    # entry that precedes it.
    AI_CONVERSATION_PATTERNS: List[Tuple[str, str]] = [
        # Openers of a chat-style answer
        (r'\bsure[,!]?\s*(here\s+is|i\'ll|i\s+will|let\s+me)\b', "Conversational AI response"),
        (r'\bi\'?d\s+be\s+happy\s+to\b', "Conversational AI response"),
        (r'\bi\'?m\s+happy\s+to\s+help\b', "Conversational AI response"),
        (r'\bcertainly[!,]\s*here\b', "Conversational AI response"),
        (r'\bof\s+course[!,]\s*(here|i)\b', "Conversational AI response"),
        (r'\babsolutely[!,]\s*(here|let\s+me)\b', "Conversational AI response"),

        # The assistant talking about itself
        (r'\bas\s+an?\s+ai\s+(language\s+)?model\b', "AI self-reference"),
        (r'\bas\s+a\s+large\s+language\s+model\b', "AI self-reference"),
        (r'\bmy\s+knowledge\s+cutoff\b', "AI knowledge cutoff reference"),

        # Announcing what is about to be delivered
        (r'\blet\s+me\s+(explain|help|clarify|break\s+this\s+down)\b', "Conversational AI response"),
        (r'\bhere\'?s\s+(a|an|the|my)\s+(revised|updated|improved|rewrite)\b', "Conversational AI response"),
        (r'\bhere\s+is\s+(the|a|an)\s+(summary|breakdown|explanation|code|example)\b', "Conversational AI response"),

        # Closings and pleasantries
        (r'\bhope\s+this\s+helps\b', "Conversational AI closing"),
        (r'\bfeel\s+free\s+to\s+ask\b', "Conversational AI closing"),
        (r'\blet\s+me\s+know\s+if\b', "Conversational AI closing"),
        (r'\bthank\s+you\s+for\s+(asking|your\s+question)\b', "Conversational AI response"),
        (r'\bgreat\s+question[!,]?\b', "Conversational AI response"),
        (r'\b(excellent|good|great)\s+point\b', "Conversational AI response"),

        # Instruction-style phrasing
        (r'\bbased\s+on\s+the\s+information\s+provided\b', "Conversational AI response"),
        (r'\b(remember|note)\s+that\b', "Conversational AI instruction"),
        (r'\bplease\s+note\s+that\b', "Conversational AI instruction"),
    ]

    # Every matching entry is reported (no early exit), so one line can
    # produce several placeholder warnings.
    PLACEHOLDER_PATTERNS: List[Tuple[str, str]] = [
        (r'\[insert\s+[^\]]+\s*here\]', "Placeholder text"),
        (r'\[add\s+[^\]]+\]', "Placeholder text"),
        (r'\[todo[:\s][^\]]*\]', "TODO placeholder"),
        (r'\btodo\s*:\s*.{0,50}', "TODO comment"),
        (r'\bfixme\s*:\s*.{0,50}', "FIXME comment"),
        (r'\bxxx\b', "XXX placeholder"),
        (r'\byour[\s_-]*(name|email|institution|university)\b', "Placeholder for personal info"),
        (r'author[\s_-]*name', "Author name placeholder"),
        (r'your\.?email@example\.com', "Email placeholder"),
        (r'example@(example\.com|university\.edu)', "Email placeholder"),
        (r'\[citation\s+needed\]', "Citation needed placeholder"),
    ]

    # Applied case-sensitively, one line at a time. Because of the per-line
    # scan, the fenced code-block entry can only match when both fences sit
    # on the same line. check() keys special handling off the words
    # "bullet point" and "italic" in the descriptions, so keep them intact.
    MARKDOWN_PATTERNS: List[Tuple[str, str]] = [
        (r'^\s*#{1,6}\s+\w', "Markdown header"),
        (r'\*\*[^*]+\*\*', "Markdown bold"),
        (r'(?<!\*)\*[^*\s][^*]*[^*\s]\*(?!\*)', "Markdown italic"),
        (r'(?<!`)`[^`\n]+`(?!`)', "Markdown inline code"),
        (r'```[\s\S]*?```', "Markdown code block"),
        (r'^\s*[-*+]\s+\w', "Markdown bullet point"),
        (r'^\s*\d+\.\s+\w', "Markdown numbered list"),
        (r'\[([^\]]+)\]\(([^)]+)\)', "Markdown link"),
    ]

    # Environments whose body is skipped entirely (starred variants included).
    _VERBATIM_ENVS = ('verbatim', 'lstlisting', 'minted', 'comment', 'raw', 'filecontents', 'tcolorbox')
    # Group 1 captures the environment name including an optional '*', so the
    # matching \end{...} can be looked up literally.
    _VERBATIM_BEGIN_RE = re.compile(r'\\begin\{((?:' + '|'.join(_VERBATIM_ENVS) + r')\*?)\}')
    _VERBATIM_END_RE = re.compile(r'\\end\{(?:' + '|'.join(_VERBATIM_ENVS) + r')\*?\}')

    def _advance_verbatim_state(self, line: str, open_env):
        """Return the verbatim environment still open after *line*.

        Args:
            line: One raw source line.
            open_env: Name of the environment open before this line
                (e.g. ``'verbatim'`` or ``'lstlisting*'``), or ``None``.

        Returns:
            The name of the environment that remains open once the line has
            been consumed, or ``None`` when no environment is open.
        """
        pos = 0
        while True:
            if open_env is None:
                begin = self._VERBATIM_BEGIN_RE.search(line, pos)
                if begin is None:
                    return None
                open_env, pos = begin.group(1), begin.end()
            # Only the environment's own \end closes it; a different \end{...}
            # appearing inside the body is literal text.
            closing = '\\end{' + open_env + '}'
            end_at = line.find(closing, pos)
            if end_at < 0:
                return open_env
            # Closed on this line: keep scanning in case another one opens.
            open_env, pos = None, end_at + len(closing)

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan LaTeX source for AI artifacts, placeholders and Markdown.

        Args:
            tex_content: Full LaTeX source; it is split on ``'\\n'`` and
                examined line by line.
            config: Accepted for interface compatibility; not read by this
                checker.

        Returns:
            One result per finding, in line order. Line numbers are 1-based
            and ``line_content`` is the stripped line truncated to 100
            characters.
        """
        results = []
        open_env = None  # name of the verbatim-like environment we are inside

        for line_num, line in enumerate(tex_content.split('\n'), 1):
            # Any line that is inside, opens, or closes a verbatim-like
            # environment is skipped. Tracking the environment by name (and
            # resolving begin/end pairs on the same line) keeps a one-line
            # environment from suppressing the rest of the document.
            if (open_env is not None
                    or self._VERBATIM_BEGIN_RE.search(line)
                    or self._VERBATIM_END_RE.search(line)):
                open_env = self._advance_verbatim_state(line, open_env)
                continue

            # Helpers are not defined in this class; presumably inherited
            # from BaseChecker.
            if self._is_comment_line(line):
                continue
            line_to_check = self._remove_line_comment(line)
            shown = line.strip()[:100]

            # Conversational AI text: report only the first hit per line.
            for pattern, label in self.AI_CONVERSATION_PATTERNS:
                if re.search(pattern, line_to_check, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{label} detected",
                        line_number=line_num,
                        line_content=shown,
                        suggestion="Remove AI-generated conversational text"
                    ))
                    break

            # Placeholders: report every pattern that matches.
            for pattern, label in self.PLACEHOLDER_PATTERNS:
                match = re.search(pattern, line_to_check, re.IGNORECASE)
                if match:
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=f"{label}: '{match.group(0)[:50]}'",
                        line_number=line_num,
                        line_content=shown,
                        suggestion="Replace placeholder with actual content or remove"
                    ))

            # Markdown: a line that starts with a LaTeX command is never
            # treated as Markdown. This is the last step for the line, so
            # moving on to the next line is equivalent to skipping every
            # Markdown pattern.
            if line_to_check.strip().startswith('\\'):
                continue
            has_math = '$' in line_to_check

            for pattern, label in self.MARKDOWN_PATTERNS:
                if "bullet point" in label:
                    # "-3" / "+2" and inline math look like list markers but
                    # are ordinary LaTeX content.
                    if has_math or re.search(r'[-+]\d', line_to_check):
                        continue
                if "italic" in label and has_math:
                    # '*' inside math is multiplication, not emphasis.
                    continue
                if re.search(pattern, line_to_check):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Possible {label} in LaTeX",
                        line_number=line_num,
                        line_content=shown,
                        suggestion="Convert to LaTeX formatting or remove if unintentional"
                    ))

        return results
|
|
|