import re

# ---------------------------------------------------------------------------
# Detection patterns used by has_machine_artifacts().
# ---------------------------------------------------------------------------

# A file-system-looking path (drive letter, "./", "../" or "dir/") that ends in
# one of a fixed set of data/source file extensions.
PLACEHOLDER_PATH_PATTERN = re.compile(
    r"(?i)\b(?:[a-z]:[\\/]|(?:\.{1,2}|[\w.-]+)[\\/])"
    r"[\w .-]+(?:[\\/][\w .-]+)*(?:\.(?:json|jsonl|csv|txt|md|py|js|ts|html|xml|yaml|yml))\b"
)

# Either two null-ish tokens on the same line, or a stack-trace style header.
MACHINE_ARTIFACT_PATTERN = re.compile(
    r"(?i)(?:"
    r"\b(?:null|undefined|nan)\b.*\b(?:null|undefined|nan)\b|"
    r"\b(?:stack\s*trace|traceback\s*\(|exception\s+in\s+thread)\b"
    r")"
)

# ---------------------------------------------------------------------------
# Cleaning patterns.
# ---------------------------------------------------------------------------

# The product name in any letter case, as a whole word.
REFRAMR_NAME_PATTERN = re.compile(r"\breframr\b", re.IGNORECASE)

# A chat-role label ("user:", "assistant:", ...) at the start of any line.
LINE_ROLE_PREFIX_PATTERN = re.compile(
    r"(?im)^\s*(?:user|assistant|human|system|bot|model|gpt)\s*:\s*"
)

# A chat-role label that directly follows a <reason>/<answer> tag; group 1 is
# the tag plus its trailing whitespace so it can be kept on substitution.
STRUCTURAL_ROLE_PREFIX_PATTERN = re.compile(
    r"(?i)(<(?:reason|answer)>\s+)(?:user|assistant|human|system|bot|model|gpt)\s*:\s*"
)

# A whole line that is boilerplate prompt scaffolding.
SYSTEM_SCAFFOLD_LINE_PATTERN = re.compile(
    r"(?i)^\s*(?:"
    r"you\s+are\s+(?:an?\s+)?(?:helpful\s+)?(?:ai\s+)?assistant\b.*|"
    r"your\s+role\s+as\s+an\s+assistant\s+involves\b.*|"
    r"you\s+will\s+be\s+given\s+a\s+task\b.*|"
    r"your\s+goal\s+is\s+to\s+complete\s+the\s+task\b.*|"
    r"you\s+must\s+generate\s+a\s+detailed\s+and\s+long\s+answer\b.*|"
    r"please\s+structure\s+your\s+response\s+into\s+two\s+main\s+sections\b.*|"
    r"in\s+the\s+thought\s+section\b.*|"
    r"in\s+the\s+solution\s+section\b.*|"
    r"now,\s*try\s+to\s+solve\s+the\s+following\s+question\b.*|"
    r"while\s+answering\s+think\s+step\s*[- ]?\s*by\s*[- ]?\s*step\b.*|"
    r"think\s+like\s+you\s+are\s+answering\b.*"
    r")\s*$"
)

# <|begin_of_solution|> ... <|end_of_solution|>; group 1 is the inner text.
OPEN_SOLUTION_PATTERN = re.compile(
    r"(?is)<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
)

# <|begin_of_thought|> ... <|end_of_thought|>, including the markers.
OPEN_THOUGHT_PATTERN = re.compile(
    r"(?is)<\|begin_of_thought\|>.*?<\|end_of_thought\|>"
)

# Any remaining <|...|> marker.
OPEN_TAG_PATTERN = re.compile(r"(?is)<\|[^>]+?\|>")

# A conversational opener ("Sure!", "Of course, ", ...) at the very start.
LEADING_ASSISTANT_FILLER_PATTERN = re.compile(
    r"(?is)^\s*(?:sure(?:\s+thing)?|certainly|absolutely|of\s+course|yes)\s*[!,.:-]*\s+"
)

# Characters whose presence suggests UTF-8 text that was decoded as a
# single-byte Western encoding.
# NOTE(review): the last three entries appear to repeat the first three —
# confirm whether distinct (e.g. double-encoded) variants were intended.
MOJIBAKE_MARKERS = ("â", "Ã", "Â", "â", "Ã", "Â")

# The five byte values that Windows-1252 leaves undefined. Lenient decoders
# pass them through as the C1 control characters with the same code point,
# which Python's strict "cp1252" codec then refuses to encode back.
_CP1252_UNDEFINED_C1 = frozenset("\x81\x8d\x8f\x90\x9d")


def canonicalize_reframr_name(text: str) -> str:
    """Rewrite every whole-word, case-insensitive "reframr" as "Reframr"."""
    return REFRAMR_NAME_PATTERN.sub("Reframr", text)


def _count_mojibake_markers(text: str) -> int:
    """Return the total number of MOJIBAKE_MARKERS occurrences in *text*."""
    return sum(text.count(marker) for marker in MOJIBAKE_MARKERS)


def _encode_lenient_cp1252(text: str) -> bytes:
    """Encode *text* as cp1252, mapping the five undefined C1 controls to their own byte.

    Raises UnicodeEncodeError for any other character cp1252 cannot represent.
    """
    return b"".join(
        ch.encode("latin1") if ch in _CP1252_UNDEFINED_C1 else ch.encode("cp1252")
        for ch in text
    )


def _mojibake_candidates(text: str):
    """Yield each successful "encode as legacy codec, decode as UTF-8" of *text*."""
    for encoding in ("cp1252", "latin1"):
        try:
            yield text.encode(encoding).decode("utf-8")
        except UnicodeError:
            continue
    # Strict cp1252 rejects the undefined C1 controls and latin1 rejects
    # characters such as the euro sign, so mojibake like "â€\x9d" (a mangled
    # right double quote) needs a lenient cp1252 round trip. Only attempted
    # when such a control is present, so ordinary input pays nothing extra.
    if any(ch in text for ch in _CP1252_UNDEFINED_C1):
        try:
            yield _encode_lenient_cp1252(text).decode("utf-8")
        except UnicodeError:
            pass


def repair_common_mojibake(text: str) -> str:
    """Undo up to three layers of UTF-8-read-as-cp1252/latin1 mojibake.

    On each pass the text is re-encoded with every supported legacy codec and
    decoded as UTF-8; the candidate with the fewest MOJIBAKE_MARKERS wins, but
    only if it has strictly fewer markers than the current text. Text with no
    markers, or for which no candidate is an improvement, is returned as is.

    Args:
        text: The possibly mis-decoded string.

    Returns:
        The repaired string, or *text* unchanged when nothing improved it.
    """
    repaired = text
    for _ in range(3):
        current_markers = _count_mojibake_markers(repaired)
        if not current_markers:
            break
        best, best_markers = repaired, current_markers
        for candidate in _mojibake_candidates(repaired):
            candidate_markers = _count_mojibake_markers(candidate)
            if candidate_markers < best_markers:
                best, best_markers = candidate, candidate_markers
        if best == repaired:
            break
        repaired = best
    return repaired


def strip_role_prefixes(text: str) -> str:
    """Remove chat-role labels after <reason>/<answer> tags and at line starts.

    The tag itself is kept; the result is stripped of surrounding whitespace.
    """
    cleaned = STRUCTURAL_ROLE_PREFIX_PATTERN.sub(r"\1", text)
    return LINE_ROLE_PREFIX_PATTERN.sub("", cleaned).strip()


def strip_instruction_scaffold(text: str) -> str:
    """Drop every line that matches SYSTEM_SCAFFOLD_LINE_PATTERN.

    Remaining lines are re-joined with "\\n" and the result is stripped.
    """
    kept = [
        line
        for line in text.splitlines()
        if not SYSTEM_SCAFFOLD_LINE_PATTERN.match(line)
    ]
    return "\n".join(kept).strip()


def clean_training_text(text: str) -> str:
    """Repair mojibake, canonicalize the Reframr name, and strip role labels."""
    repaired = repair_common_mojibake(text)
    return strip_role_prefixes(canonicalize_reframr_name(repaired)).strip()


def clean_context_text(text: str) -> str:
    """Apply clean_training_text, then remove instruction-scaffold lines."""
    return strip_instruction_scaffold(clean_training_text(text))


def clean_answer_text(text: str) -> str:
    """Clean an answer and reduce it to its solution content.

    After clean_training_text, the first <|begin_of_solution|> section is used
    if one exists; otherwise any <|begin_of_thought|> sections are removed.
    Leftover <|...|> markers and a leading filler word are then stripped.
    """
    cleaned = clean_training_text(text)
    solution_match = OPEN_SOLUTION_PATTERN.search(cleaned)
    if solution_match:
        cleaned = solution_match.group(1)
    else:
        cleaned = OPEN_THOUGHT_PATTERN.sub("", cleaned)
    cleaned = OPEN_TAG_PATTERN.sub("", cleaned)
    cleaned = LEADING_ASSISTANT_FILLER_PATTERN.sub("", cleaned)
    return cleaned.strip()


def has_machine_artifacts(text: str) -> bool:
    """Report whether *text* contains log, file-path, or encoding debris.

    Returns True on the first hit among: any MOJIBAKE_MARKERS character, a
    PLACEHOLDER_PATH_PATTERN match, or a MACHINE_ARTIFACT_PATTERN match.
    Empty or otherwise falsy input returns False.
    """
    if not text:
        return False
    if any(marker in text for marker in MOJIBAKE_MARKERS):
        return True
    if PLACEHOLDER_PATH_PATTERN.search(text):
        return True
    return bool(MACHINE_ARTIFACT_PATTERN.search(text))