BibGuard / src /checkers /ai_artifacts_checker.py
thinkwee
init
46df5f0
"""
AI artifacts checker.
Detects leftover text from AI writing assistants that should be removed
before submission, such as:
- Conversational responses ("Sure, here is...")
- Placeholder text
- Markdown formatting artifacts
- Common AI response patterns
"""
import re
from typing import List, Tuple
from .base import BaseChecker, CheckResult, CheckSeverity
class AIArtifactsChecker(BaseChecker):
"""Detect AI-generated text artifacts that should be removed."""
name = "ai_artifacts"
display_name = "AI Artifacts"
description = "Detect leftover AI assistant text and placeholders"
# Conversational AI patterns (case insensitive)
# These are phrases that clearly indicate a dialogue between user and AI assistant
AI_CONVERSATION_PATTERNS = [
# Responses to requests
(r'\bsure[,!]?\s*(here\s+is|i\'ll|i\s+will|let\s+me)\b', "Conversational AI response"),
(r'\bi\'?d\s+be\s+happy\s+to\b', "Conversational AI response"),
(r'\bi\'?m\s+happy\s+to\s+help\b', "Conversational AI response"),
(r'\bcertainly[!,]\s*here\b', "Conversational AI response"),
(r'\bof\s+course[!,]\s*(here|i)\b', "Conversational AI response"),
(r'\babsolutely[!,]\s*(here|let\s+me)\b', "Conversational AI response"),
# Self-identification
(r'\bas\s+an?\s+ai\s+(language\s+)?model\b', "AI self-reference"),
(r'\bas\s+a\s+large\s+language\s+model\b', "AI self-reference"),
(r'\bmy\s+knowledge\s+cutoff\b', "AI knowledge cutoff reference"),
# Explanatory transitions typical of chat
(r'\blet\s+me\s+(explain|help|clarify|break\s+this\s+down)\b', "Conversational AI response"),
(r'\bhere\'?s\s+(a|an|the|my)\s+(revised|updated|improved|rewrite)\b', "Conversational AI response"),
(r'\bhere\s+is\s+(the|a|an)\s+(summary|breakdown|explanation|code|example)\b', "Conversational AI response"),
# Closing/Politeness
(r'\bhope\s+this\s+helps\b', "Conversational AI closing"),
(r'\bfeel\s+free\s+to\s+ask\b', "Conversational AI closing"),
(r'\blet\s+me\s+know\s+if\b', "Conversational AI closing"),
(r'\bthank\s+you\s+for\s+(asking|your\s+question)\b', "Conversational AI response"),
(r'\bgreat\s+question[!,]?\b', "Conversational AI response"),
(r'\b(excellent|good|great)\s+point\b', "Conversational AI response"),
# Instructions/Meta-commentary
(r'\bbased\s+on\s+the\s+information\s+provided\b', "Conversational AI response"),
(r'\b(remember|note)\s+that\b', "Conversational AI instruction"),
(r'\bplease\s+note\s+that\b', "Conversational AI instruction"),
]
# Placeholder patterns
PLACEHOLDER_PATTERNS = [
(r'\[insert\s+[^\]]+\s*here\]', "Placeholder text"),
(r'\[add\s+[^\]]+\]', "Placeholder text"),
(r'\[todo[:\s][^\]]*\]', "TODO placeholder"),
(r'\btodo\s*:\s*.{0,50}', "TODO comment"),
(r'\bfixme\s*:\s*.{0,50}', "FIXME comment"),
(r'\bxxx\b', "XXX placeholder"),
(r'\byour[\s_-]*(name|email|institution|university)\b', "Placeholder for personal info"),
(r'author[\s_-]*name', "Author name placeholder"),
(r'your\.?email@example\.com', "Email placeholder"),
(r'example@(example\.com|university\.edu)', "Email placeholder"),
(r'\[citation\s+needed\]', "Citation needed placeholder"),
]
# Markdown artifacts (should not appear in LaTeX)
MARKDOWN_PATTERNS = [
(r'^\s*#{1,6}\s+\w', "Markdown header"),
(r'\*\*[^*]+\*\*', "Markdown bold"),
(r'(?<!\*)\*[^*\s][^*]*[^*\s]\*(?!\*)', "Markdown italic"),
(r'(?<!`)`[^`\n]+`(?!`)', "Markdown inline code"),
(r'```[\s\S]*?```', "Markdown code block"),
(r'^\s*[-*+]\s+\w', "Markdown bullet point"),
(r'^\s*\d+\.\s+\w', "Markdown numbered list"),
(r'\[([^\]]+)\]\(([^)]+)\)', "Markdown link"),
]
def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
results = []
lines = tex_content.split('\n')
# Track if we are inside a verbatim-like environment
in_verbatim = False
verbatim_envs = ['verbatim', 'lstlisting', 'minted', 'comment', 'raw', 'filecontents', 'tcolorbox']
# Check each line
for line_num, line in enumerate(lines, 1):
# Check for environment boundaries
# Handle \begin{env}
if re.search(r'\\begin\{(' + '|'.join(verbatim_envs) + r')\*?\}', line):
in_verbatim = True
continue # Skip the begin line itself
# Handle \end{env}
if re.search(r'\\end\{(' + '|'.join(verbatim_envs) + r')\*?\}', line):
in_verbatim = False
continue # Skip the end line itself
# Skip checks if inside verbatim environment
if in_verbatim:
continue
# Skip commented lines using base class method
if self._is_comment_line(line):
continue
# Remove inline comments for checking using base class method
line_to_check = self._remove_line_comment(line)
# Check AI conversation patterns
for pattern, description in self.AI_CONVERSATION_PATTERNS:
if re.search(pattern, line_to_check, re.IGNORECASE):
results.append(self._create_result(
passed=False,
severity=CheckSeverity.ERROR,
message=f"{description} detected",
line_number=line_num,
line_content=line.strip()[:100],
suggestion="Remove AI-generated conversational text"
))
break # One match per line for this category
# Check placeholder patterns
for pattern, description in self.PLACEHOLDER_PATTERNS:
match = re.search(pattern, line_to_check, re.IGNORECASE)
if match:
results.append(self._create_result(
passed=False,
severity=CheckSeverity.WARNING,
message=f"{description}: '{match.group(0)[:50]}'",
line_number=line_num,
line_content=line.strip()[:100],
suggestion="Replace placeholder with actual content or remove"
))
# Check Markdown patterns (less strict - might be intentional in some cases)
for pattern, description in self.MARKDOWN_PATTERNS:
# Skip if line looks like a LaTeX command (starts with \)
if line_to_check.strip().startswith('\\'):
continue
# Special handling for bullet points: ensure space after
if "bullet point" in description:
# Skip if it looks like a math subtraction or negative number
if re.search(r'[-+]\d', line_to_check):
continue
# Skip if inside math mode (simple heuristic)
if '$' in line_to_check:
continue
# Special handling for italics: avoid matching math mode like $x*y$
if "italic" in description:
if '$' in line_to_check:
continue
if re.search(pattern, line_to_check):
results.append(self._create_result(
passed=False,
severity=CheckSeverity.INFO,
message=f"Possible {description} in LaTeX",
line_number=line_num,
line_content=line.strip()[:100],
suggestion="Convert to LaTeX formatting or remove if unintentional"
))
return results