import re

# Whole-word, case-insensitive match for the project name.
REFRAMR_NAME_PATTERN = re.compile(r"\breframr\b", re.IGNORECASE)

# Speaker label at the start of a line ("User:", "assistant :", ...).
# NOTE: `\s` also matches newlines, so blank lines directly before a label
# and line breaks directly after its colon are consumed with it.
LINE_ROLE_PREFIX_PATTERN = re.compile(
    r"(?im)^\s*(?:user|assistant|human|system|bot|model|gpt)\s*:\s*"
)

# Speaker label that directly follows a <reason> or <answer> tag; group 1
# holds the tag plus its trailing whitespace so it can be kept.
STRUCTURAL_ROLE_PREFIX_PATTERN = re.compile(
    r"(?i)(<(?:reason|answer)>\s+)(?:user|assistant|human|system|bot|model|gpt)\s*:\s*"
)

# A single line that consists of one of the known instruction-scaffold
# sentences (matched case-insensitively from line start to line end).
SYSTEM_SCAFFOLD_LINE_PATTERN = re.compile(
    r"(?i)^\s*(?:"
    r"you\s+are\s+(?:an?\s+)?(?:helpful\s+)?(?:ai\s+)?assistant\b.*|"
    r"your\s+role\s+as\s+an\s+assistant\s+involves\b.*|"
    r"you\s+will\s+be\s+given\s+a\s+task\b.*|"
    r"your\s+goal\s+is\s+to\s+complete\s+the\s+task\b.*|"
    r"you\s+must\s+generate\s+a\s+detailed\s+and\s+long\s+answer\b.*|"
    r"please\s+structure\s+your\s+response\s+into\s+two\s+main\s+sections\b.*|"
    r"in\s+the\s+thought\s+section\b.*|"
    r"in\s+the\s+solution\s+section\b.*|"
    r"now,\s*try\s+to\s+solve\s+the\s+following\s+question\b.*|"
    r"while\s+answering\s+think\s+step\s*[- ]?\s*by\s*[- ]?\s*step\b.*|"
    r"think\s+like\s+you\s+are\s+answering\b.*"
    r")\s*$"
)

# Text between <|begin_of_solution|> and <|end_of_solution|> (group 1).
OPEN_SOLUTION_PATTERN = re.compile(
    r"(?is)<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
)

# A complete <|begin_of_thought|> ... <|end_of_thought|> section.
OPEN_THOUGHT_PATTERN = re.compile(
    r"(?is)<\|begin_of_thought\|>.*?<\|end_of_thought\|>"
)

# Any remaining <|...|> marker.
OPEN_TAG_PATTERN = re.compile(r"(?is)<\|[^>]+?\|>")

# Opening filler word ("Sure!", "Of course,", ...) followed by whitespace.
LEADING_ASSISTANT_FILLER_PATTERN = re.compile(
    r"(?is)^\s*(?:sure(?:\s+thing)?|certainly|absolutely|of\s+course|yes)\s*[!,.:-]*\s+"
)

# Characters whose presence suggests UTF-8 bytes were decoded as a legacy
# single-byte code page.
# NOTE(review): the entries repeat. Every count is scaled uniformly by the
# repetition, so the "fewer markers wins" comparisons are unaffected; the
# tuple is left as-is because it is a public module name.
MOJIBAKE_MARKERS = ("â", "Ã", "Â", "â", "Ã", "Â")

# Runs of C1 control characters (U+0080-U+009F). The capturing group makes
# `split` return the runs themselves at the odd indices of its result.
_C1_CONTROL_RUN_PATTERN = re.compile(r"([\x80-\x9f]+)")


def canonicalize_reframr_name(text: str) -> str:
    """Return *text* with every whole-word "reframr" spelled "Reframr"."""
    return REFRAMR_NAME_PATTERN.sub("Reframr", text)


def _count_mojibake_markers(text: str) -> int:
    """Return the summed occurrence count of all MOJIBAKE_MARKERS in *text*."""
    return sum(text.count(marker) for marker in MOJIBAKE_MARKERS)


def _encode_lenient_cp1252(text: str) -> bytes:
    """Encode *text* as cp1252, using the Latin-1 byte for C1 controls.

    Python's strict cp1252 codec cannot encode U+0080-U+009F, yet text that
    was decoded leniently as Windows-1252 carries exactly those characters
    for the bytes the code page leaves undefined.

    Raises:
        UnicodeEncodeError: if a character outside both mappings is present.
    """
    segments = _C1_CONTROL_RUN_PATTERN.split(text)
    # Even indices are the text between C1 runs, odd indices are the runs.
    return b"".join(
        segment.encode("latin1" if index % 2 else "cp1252")
        for index, segment in enumerate(segments)
    )


def _mojibake_repair_candidates(text: str) -> list:
    """Return the valid UTF-8 readings of *text* re-encoded as a legacy page.

    Candidates are ordered cp1252, latin1, then the lenient cp1252 mix; an
    encoding that cannot represent *text*, or whose bytes are not valid
    UTF-8, contributes nothing.
    """
    byte_readings = []
    for encoding in ("cp1252", "latin1"):
        try:
            byte_readings.append(text.encode(encoding))
        except UnicodeEncodeError:
            continue
    # The lenient reading can only differ from the strict ones when a C1
    # control character is present, so skip it otherwise.
    if _C1_CONTROL_RUN_PATTERN.search(text):
        try:
            byte_readings.append(_encode_lenient_cp1252(text))
        except UnicodeEncodeError:
            pass

    candidates = []
    for raw in byte_readings:
        try:
            candidates.append(raw.decode("utf-8"))
        except UnicodeDecodeError:
            continue
    return candidates


def repair_common_mojibake(text: str) -> str:
    """Undo up to three layers of UTF-8-read-as-cp1252/latin1 mojibake.

    Each round re-encodes the whole string with a legacy code page and
    decodes the bytes as UTF-8, keeping the first candidate that has
    strictly fewer MOJIBAKE_MARKERS than anything seen so far. The text is
    returned unchanged when it has no markers or when no candidate lowers
    the marker count.

    Besides strict cp1252 and latin1, a lenient cp1252 reading is tried so
    that sequences such as "â€\\x9d" (a right double quote whose last byte
    is undefined in Windows-1252) are repaired as well.
    """
    repaired = text
    for _ in range(3):
        best = repaired
        best_markers = _count_mojibake_markers(repaired)
        if not best_markers:
            break
        for candidate in _mojibake_repair_candidates(repaired):
            candidate_markers = _count_mojibake_markers(candidate)
            if candidate_markers < best_markers:
                best = candidate
                best_markers = candidate_markers
        if best == repaired:
            # No re-encoding improved the text; further rounds cannot either.
            break
        repaired = best
    return repaired


def strip_role_prefixes(text: str) -> str:
    """Remove speaker labels after <reason>/<answer> tags and at line starts.

    The tag itself is kept; the result is stripped of surrounding whitespace.
    """
    without_structural = STRUCTURAL_ROLE_PREFIX_PATTERN.sub(r"\1", text)
    return LINE_ROLE_PREFIX_PATTERN.sub("", without_structural).strip()


def strip_instruction_scaffold(text: str) -> str:
    """Drop every line matching SYSTEM_SCAFFOLD_LINE_PATTERN.

    The surviving lines are re-joined with "\\n" and the result is stripped.
    """
    kept_lines = (
        line
        for line in text.splitlines()
        if not SYSTEM_SCAFFOLD_LINE_PATTERN.match(line)
    )
    return "\n".join(kept_lines).strip()


def clean_training_text(text: str) -> str:
    """Repair mojibake, canonicalize the project name, strip role prefixes."""
    repaired = repair_common_mojibake(text)
    return strip_role_prefixes(canonicalize_reframr_name(repaired)).strip()


def clean_context_text(text: str) -> str:
    """Apply clean_training_text, then remove instruction-scaffold lines."""
    return strip_instruction_scaffold(clean_training_text(text))


def clean_answer_text(text: str) -> str:
    """Clean an answer and reduce it to its solution content.

    After clean_training_text, the first <|begin_of_solution|> section is
    used when one exists; otherwise complete thought sections are removed.
    Leftover <|...|> markers and one leading filler word are then dropped
    and the result is stripped.
    """
    cleaned = clean_training_text(text)
    solution_match = OPEN_SOLUTION_PATTERN.search(cleaned)
    if solution_match:
        cleaned = solution_match.group(1)
    else:
        cleaned = OPEN_THOUGHT_PATTERN.sub("", cleaned)
    cleaned = OPEN_TAG_PATTERN.sub("", cleaned)
    cleaned = LEADING_ASSISTANT_FILLER_PATTERN.sub("", cleaned)
    return cleaned.strip()