import re
# Case-insensitive matcher for text that looks like a file path ending in one
# of a fixed list of data/source-file extensions.
PLACEHOLDER_PATH_PATTERN = re.compile(
    # Path start: a drive letter ("c:\" or "c:/"), or "."/".."/a name followed
    # by a slash or backslash.
    r"(?i)\b(?:[a-z]:[\\/]|(?:\.{1,2}|[\w.-]+)[\\/])"
    # One or more path segments (word chars, spaces, dots, hyphens).
    r"[\w .-]+(?:[\\/][\w .-]+)*"
    # Required trailing extension.
    r"(?:\.(?:json|jsonl|csv|txt|md|py|js|ts|html|xml|yaml|yml))\b"
)
# Case-insensitive matcher for two kinds of machine output:
#   1. two null/undefined/nan tokens on the same line ("." does not cross
#      newlines here), or
#   2. a stack-trace style marker.
MACHINE_ARTIFACT_PATTERN = re.compile(
    r"(?i)(?:"
    # Two sentinel words separated by anything on one line.
    r"\b(?:null|undefined|nan)\b.*\b(?:null|undefined|nan)\b|"
    # "stack trace", "traceback (" or "exception in thread".
    r"\b(?:stack\s*trace|traceback\s*\(|exception\s+in\s+thread)\b"
    r")"
)
# Whole-word, case-insensitive match for the name "reframr".
REFRAMR_NAME_PATTERN = re.compile(
    r"\breframr\b",
    flags=re.IGNORECASE,
)
# Case-insensitive, multiline matcher for a chat-role label ("user:",
# "assistant:", ...) at the start of any line, including the whitespace
# around it.
LINE_ROLE_PREFIX_PATTERN = re.compile(
    r"(?im)^\s*"
    r"(?:user|assistant|human|system|bot|model|gpt)"
    r"\s*:\s*"
)
# Case-insensitive matcher for a chat-role label that directly follows a
# "<reason>" or "<answer>" tag plus whitespace. Group 1 captures the tag and
# that whitespace so a substitution can keep them.
STRUCTURAL_ROLE_PREFIX_PATTERN = re.compile(
    r"(?i)(<(?:reason|answer)>\s+)"
    r"(?:user|assistant|human|system|bot|model|gpt)"
    r"\s*:\s*"
)
# Case-insensitive matcher for a whole line of prompt boilerplate. Each
# alternative anchors on an opening phrase and then swallows the rest of the
# line; the alternatives are joined with "|" inside one non-capturing group.
SYSTEM_SCAFFOLD_LINE_PATTERN = re.compile(
    r"(?i)^\s*(?:"
    + "|".join(
        (
            r"you\s+are\s+(?:an?\s+)?(?:helpful\s+)?(?:ai\s+)?assistant\b.*",
            r"your\s+role\s+as\s+an\s+assistant\s+involves\b.*",
            r"you\s+will\s+be\s+given\s+a\s+task\b.*",
            r"your\s+goal\s+is\s+to\s+complete\s+the\s+task\b.*",
            r"you\s+must\s+generate\s+a\s+detailed\s+and\s+long\s+answer\b.*",
            r"please\s+structure\s+your\s+response\s+into\s+two\s+main\s+sections\b.*",
            r"in\s+the\s+thought\s+section\b.*",
            r"in\s+the\s+solution\s+section\b.*",
            r"now,\s*try\s+to\s+solve\s+the\s+following\s+question\b.*",
            r"while\s+answering\s+think\s+step\s*[- ]?\s*by\s*[- ]?\s*step\b.*",
            r"think\s+like\s+you\s+are\s+answering\b.*",
        )
    )
    + r")\s*$"
)
# Matches a <|begin_of_solution|> ... <|end_of_solution|> span (case-insensitive,
# "." spans newlines). Group 1 is the shortest text between the two tags.
OPEN_SOLUTION_PATTERN = re.compile(
    r"(?is)<\|begin_of_solution\|>"
    r"(.*?)"
    r"<\|end_of_solution\|>"
)
# Matches a whole <|begin_of_thought|> ... <|end_of_thought|> span, tags
# included (case-insensitive, "." spans newlines, shortest match).
OPEN_THOUGHT_PATTERN = re.compile(
    r"(?is)<\|begin_of_thought\|>"
    r".*?"
    r"<\|end_of_thought\|>"
)
# Matches any single "<|...|>" style tag whose interior contains no ">".
OPEN_TAG_PATTERN = re.compile(
    r"(?is)<\|[^>]+?\|>"
)
# Matches an opening filler word ("sure", "sure thing", "certainly",
# "absolutely", "of course", "yes") at the very start of the text, together
# with optional punctuation and the whitespace that must follow it.
LEADING_ASSISTANT_FILLER_PATTERN = re.compile(
    r"(?is)^\s*"
    r"(?:sure(?:\s+thing)?|certainly|absolutely|of\s+course|yes)"
    r"\s*[!,.:-]*\s+"
)
# Substrings whose presence is treated as a sign of mis-decoded text; used by
# repair_common_mojibake (to score candidates) and has_machine_artifacts.
# NOTE(review): as rendered here the last three entries look identical to the
# first three - confirm against the real file whether they are distinct
# sequences (e.g. mangled by a viewer) before de-duplicating.
MOJIBAKE_MARKERS = ("â", "Ã", "Â", "â", "Ã", "Â")
def canonicalize_reframr_name(text: str) -> str:
    """Return *text* with every ``REFRAMR_NAME_PATTERN`` match spelled ``Reframr``."""
    canonical_spelling = "Reframr"
    return REFRAMR_NAME_PATTERN.sub(canonical_spelling, text)
def repair_common_mojibake(text: str) -> str:
    """Try to undo text that was decoded with the wrong single-byte codec.

    Runs at most three rounds. In each round the current text is re-encoded
    as cp1252 and as latin1 and decoded as UTF-8; a candidate replaces the
    current text only if it contains strictly fewer ``MOJIBAKE_MARKERS``
    occurrences. The loop stops early when no marker is present or when no
    candidate improves on the current text.

    Returns the best text found (the input unchanged if nothing helped).
    """

    def marker_total(value: str) -> int:
        # Total number of marker occurrences across all configured markers.
        return sum(value.count(marker) for marker in MOJIBAKE_MARKERS)

    current = text
    for _round in range(3):
        baseline = marker_total(current)
        if baseline == 0:
            # Nothing that looks like mojibake is left.
            break

        winner, winner_score = current, baseline
        for codec in ("cp1252", "latin1"):
            try:
                attempt = current.encode(codec).decode("utf-8")
            except UnicodeError:
                # Either the text is not representable in this codec or the
                # bytes are not valid UTF-8; this codec cannot help.
                continue
            attempt_score = marker_total(attempt)
            if attempt_score < winner_score:
                winner, winner_score = attempt, attempt_score

        if winner == current:
            # No candidate reduced the marker count; further rounds are futile.
            break
        current = winner
    return current
def strip_role_prefixes(text: str) -> str:
    """Remove chat-role labels from *text* and trim surrounding whitespace.

    Labels that follow a ``<reason>``/``<answer>`` tag are removed first
    (the tag and its trailing whitespace are kept), then labels at the start
    of any line.
    """
    after_structural = STRUCTURAL_ROLE_PREFIX_PATTERN.sub(r"\1", text)
    after_line_prefixes = LINE_ROLE_PREFIX_PATTERN.sub("", after_structural)
    return after_line_prefixes.strip()
def strip_instruction_scaffold(text: str) -> str:
    """Drop every line matching ``SYSTEM_SCAFFOLD_LINE_PATTERN``.

    The surviving lines are re-joined with ``"\\n"`` and the result is
    stripped of leading/trailing whitespace.
    """
    kept_lines = [
        line
        for line in text.splitlines()
        if not SYSTEM_SCAFFOLD_LINE_PATTERN.match(line)
    ]
    return "\n".join(kept_lines).strip()
def clean_training_text(text: str) -> str:
    """Apply the shared cleanup steps to *text*.

    Order: mojibake repair, then canonical ``Reframr`` spelling, then
    role-prefix removal; the result is stripped.
    """
    decoded = repair_common_mojibake(text)
    renamed = canonicalize_reframr_name(decoded)
    without_roles = strip_role_prefixes(renamed)
    return without_roles.strip()
def clean_context_text(text: str) -> str:
    """Run ``clean_training_text`` and then remove instruction-scaffold lines."""
    base_cleaned = clean_training_text(text)
    return strip_instruction_scaffold(base_cleaned)
def clean_answer_text(text: str) -> str:
    """Clean an answer string down to its solution text.

    After ``clean_training_text``:
    - if a ``<|begin_of_solution|>...<|end_of_solution|>`` span exists, only
      its interior is kept; otherwise any complete thought span is removed;
    - remaining ``<|...|>`` tags are removed;
    - a leading filler phrase matched by ``LEADING_ASSISTANT_FILLER_PATTERN``
      is removed;
    - the result is stripped.
    """
    body = clean_training_text(text)

    solution = OPEN_SOLUTION_PATTERN.search(body)
    body = OPEN_THOUGHT_PATTERN.sub("", body) if solution is None else solution.group(1)

    # Tags must go before the filler check so a filler word hidden behind a
    # leading tag is still at the start of the text when it is tested.
    for pattern in (OPEN_TAG_PATTERN, LEADING_ASSISTANT_FILLER_PATTERN):
        body = pattern.sub("", body)
    return body.strip()
def has_machine_artifacts(text: str) -> bool:
    """Report whether *text* contains log, file-path, or encoding debris.

    Returns ``False`` for empty/falsy input. Otherwise returns ``True`` as
    soon as any one check hits: a ``MOJIBAKE_MARKERS`` substring is present,
    ``PLACEHOLDER_PATH_PATTERN`` matches, or ``MACHINE_ARTIFACT_PATTERN``
    matches. A single hit is sufficient.
    """
    if not text:
        return False
    # ``or`` keeps the original short-circuit order: markers, paths, artifacts.
    return bool(
        any(marker in text for marker in MOJIBAKE_MARKERS)
        or PLACEHOLDER_PATH_PATTERN.search(text)
        or MACHINE_ARTIFACT_PATTERN.search(text)
    )
|