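# NOTE: the next four lines are header text from the hosting page, not code;
# they are kept as comments so that the module parses.
# Reframr-RFM-v1-Base / reframr/text_quality.py
# OkeyMeta's picture
# Release Reframr-RFM-v1-Base public checkpoint
# 2147ce8 verified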
import re
# Matches the standalone word "reframr" in any letter case (word boundaries on
# both sides, IGNORECASE flag).
REFRAMR_NAME_PATTERN = re.compile(r"\breframr\b", re.IGNORECASE)
# Matches a speaker label ("user:", "assistant:", "human:", "system:", "bot:",
# "model:", "gpt:") at the start of any line, case-insensitively, together with
# the whitespace before the label, before the colon and after the colon.
# Because \s also matches newlines, a match can extend across blank lines
# adjacent to the label.
LINE_ROLE_PREFIX_PATTERN = re.compile(
r"(?im)^\s*(?:user|assistant|human|system|bot|model|gpt)\s*:\s*"
)
# Matches the same speaker labels when they follow a <reason> or <answer> tag
# and at least one whitespace character. Group 1 captures the tag plus that
# whitespace so a substitution can put it back while dropping the label.
STRUCTURAL_ROLE_PREFIX_PATTERN = re.compile(
r"(?i)(<(?:reason|answer)>\s+)(?:user|assistant|human|system|bot|model|gpt)\s*:\s*"
)
# Matches a single line of prompt boilerplate: optional leading whitespace,
# one of the fixed opening phrases listed below (case-insensitive, flexible
# whitespace between words), then the rest of the line up to its end.
# The MULTILINE flag is not set, so ^ and $ anchor to the whole string; the
# pattern is applied line by line in strip_instruction_scaffold.
SYSTEM_SCAFFOLD_LINE_PATTERN = re.compile(
r"(?i)^\s*(?:"
r"you\s+are\s+(?:an?\s+)?(?:helpful\s+)?(?:ai\s+)?assistant\b.*|"
r"your\s+role\s+as\s+an\s+assistant\s+involves\b.*|"
r"you\s+will\s+be\s+given\s+a\s+task\b.*|"
r"your\s+goal\s+is\s+to\s+complete\s+the\s+task\b.*|"
r"you\s+must\s+generate\s+a\s+detailed\s+and\s+long\s+answer\b.*|"
r"please\s+structure\s+your\s+response\s+into\s+two\s+main\s+sections\b.*|"
r"in\s+the\s+thought\s+section\b.*|"
r"in\s+the\s+solution\s+section\b.*|"
r"now,\s*try\s+to\s+solve\s+the\s+following\s+question\b.*|"
# "step by step" is accepted with spaces and/or hyphens between the words.
r"while\s+answering\s+think\s+step\s*[- ]?\s*by\s*[- ]?\s*step\b.*|"
r"think\s+like\s+you\s+are\s+answering\b.*"
r")\s*$"
)
# Matches a <|begin_of_solution|> ... <|end_of_solution|> block, case-insensitively
# and across newlines (DOTALL). Group 1 is the text between the two markers;
# the non-greedy .*? stops at the first closing marker.
OPEN_SOLUTION_PATTERN = re.compile(
r"(?is)<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
)
# Matches a whole <|begin_of_thought|> ... <|end_of_thought|> block, markers
# included, case-insensitively and across newlines. There is no capture group.
OPEN_THOUGHT_PATTERN = re.compile(
r"(?is)<\|begin_of_thought\|>.*?<\|end_of_thought\|>"
)
# Matches any single marker of the form <|...|> whose inner text is non-empty
# and contains no ">" character.
OPEN_TAG_PATTERN = re.compile(r"(?is)<\|[^>]+?\|>")
# Matches an opening filler word at the very start of the text ("sure",
# "sure thing", "certainly", "absolutely", "of course", "yes"), in any letter
# case, followed by optional punctuation from the set ! , . : - and then at
# least one whitespace character. The filler must be followed by whitespace,
# so a word such as "Sure-footed" is not matched.
LEADING_ASSISTANT_FILLER_PATTERN = re.compile(
r"(?is)^\s*(?:sure(?:\s+thing)?|certainly|absolutely|of\s+course|yes)\s*[!,.:-]*\s+"
)
# Substrings treated as signs of mis-decoded UTF-8. repair_common_mojibake
# uses them both to decide whether to attempt a repair and to score candidates
# by how many occurrences remain.
# NOTE(review): as shown here each marker appears twice in the tuple; this may
# be an artifact of how the file was copied -- confirm against the original.
MOJIBAKE_MARKERS = ("â", "Ã", "Â", "â", "Ã", "Â")
def canonicalize_reframr_name(text: str) -> str:
    """Return *text* with every standalone "reframr" rewritten as "Reframr".

    Matching is delegated to ``REFRAMR_NAME_PATTERN`` (whole word, any letter
    case), so spellings such as "REFRAMR" are normalised as well.
    """
    canonical_spelling = "Reframr"
    return REFRAMR_NAME_PATTERN.sub(canonical_spelling, text)
def _encode_lenient_cp1252(text: str) -> bytes:
    """Encode *text* as cp1252, passing the codec's unassigned bytes through.

    Python's cp1252 codec has no mapping for bytes 0x81, 0x8D, 0x8F, 0x90 and
    0x9D. Lenient decoders turn those bytes into the C1 control characters
    with the same code point, so mojibake produced by them contains e.g.
    U+009D, which the strict codec refuses to encode. Here those five
    characters are written back as their raw byte value.

    Raises:
        UnicodeEncodeError: if *text* holds any other character that cp1252
            cannot represent.
    """
    passthrough = "\x81\x8d\x8f\x90\x9d"
    encoded = bytearray()
    segment_start = 0
    for index, char in enumerate(text):
        if char in passthrough:
            # Flush the ordinary run before this character with the strict
            # codec, then emit the pass-through byte itself.
            encoded += text[segment_start:index].encode("cp1252")
            encoded.append(ord(char))
            segment_start = index + 1
    encoded += text[segment_start:].encode("cp1252")
    return bytes(encoded)


def _mojibake_byte_candidates(text: str):
    """Yield each byte string that *text* may have been mis-decoded from."""
    for codec in ("cp1252", "latin1"):
        try:
            yield text.encode(codec)
        except UnicodeEncodeError:
            continue
    # Tried last: it equals one of the strict results whenever those succeed,
    # so it can only add a repair the strict codecs could not offer.
    try:
        yield _encode_lenient_cp1252(text)
    except UnicodeEncodeError:
        return


def repair_common_mojibake(text: str) -> str:
    """Undo UTF-8 text that was mis-decoded as cp1252 or latin1.

    The text is re-encoded with each candidate legacy encoding and decoded as
    UTF-8; a candidate is accepted only if it contains strictly fewer
    ``MOJIBAKE_MARKERS`` occurrences than the current text. Up to three rounds
    are run so that repeatedly mis-decoded text can be unwound layer by layer.

    Besides strict cp1252 and latin1, a lenient cp1252 encoding is tried so
    that sequences containing the codec's unassigned bytes (for example the
    mis-decoded right double quotation mark "\u00e2\u20ac\u009d") are repaired
    as well.

    Args:
        text: The possibly garbled input.

    Returns:
        The repaired text, or *text* unchanged when no candidate reduces the
        marker count (including when it contains no markers at all).
    """
    repaired = text
    for _ in range(3):
        marker_total = sum(repaired.count(marker) for marker in MOJIBAKE_MARKERS)
        if not marker_total:
            break
        best = repaired
        best_markers = marker_total
        for raw in _mojibake_byte_candidates(repaired):
            try:
                candidate = raw.decode("utf-8")
            except UnicodeDecodeError:
                # These bytes are not valid UTF-8, so this was not the
                # encoding that produced the garbling.
                continue
            candidate_markers = sum(
                candidate.count(marker) for marker in MOJIBAKE_MARKERS
            )
            if candidate_markers < best_markers:
                best = candidate
                best_markers = candidate_markers
        if best == repaired:
            # No candidate improved on the current text; further rounds would
            # repeat the same work.
            break
        repaired = best
    return repaired
def strip_role_prefixes(text: str) -> str:
    """Remove speaker labels such as "user:" or "assistant:" from *text*.

    Labels that directly follow a <reason>/<answer> tag are removed first,
    then labels at the start of any line. The result is stripped of leading
    and trailing whitespace.
    """
    # Group 1 re-inserts the tag and its trailing whitespace; only the label
    # that follows it is dropped.
    without_tag_labels = STRUCTURAL_ROLE_PREFIX_PATTERN.sub(r"\1", text)
    without_line_labels = LINE_ROLE_PREFIX_PATTERN.sub("", without_tag_labels)
    return without_line_labels.strip()
def strip_instruction_scaffold(text: str) -> str:
    """Drop every line of *text* that is recognised as prompt boilerplate.

    Each line is tested against ``SYSTEM_SCAFFOLD_LINE_PATTERN``; the lines
    that do not match are re-joined with "\\n" and the result is stripped of
    leading and trailing whitespace.
    """
    kept_lines = [
        candidate
        for candidate in text.splitlines()
        if not SYSTEM_SCAFFOLD_LINE_PATTERN.match(candidate)
    ]
    return "\n".join(kept_lines).strip()
def clean_training_text(text: str) -> str:
    """Apply the baseline clean-up steps shared by all text kinds.

    In order: mojibake repair, canonical "Reframr" spelling, removal of
    speaker labels, and a final strip of surrounding whitespace.
    """
    demojibaked = repair_common_mojibake(text)
    renamed = canonicalize_reframr_name(demojibaked)
    unlabelled = strip_role_prefixes(renamed)
    return unlabelled.strip()
def clean_context_text(text: str) -> str:
    """Clean context text: baseline clean-up, then drop boilerplate lines."""
    normalised = clean_training_text(text)
    return strip_instruction_scaffold(normalised)
def clean_answer_text(text: str) -> str:
    """Clean answer text and reduce it to the final solution content.

    After the baseline clean-up, the content of the first
    <|begin_of_solution|> ... <|end_of_solution|> block is kept if one exists;
    otherwise any <|begin_of_thought|> ... <|end_of_thought|> blocks are
    removed. Remaining <|...|> markers and a leading filler word are then
    removed, and the result is stripped of surrounding whitespace.
    """
    body = clean_training_text(text)
    solution = OPEN_SOLUTION_PATTERN.search(body)
    if solution is None:
        body = OPEN_THOUGHT_PATTERN.sub("", body)
    else:
        body = solution.group(1)
    # Both remaining clean-ups are plain deletions, applied in this order.
    for pattern in (OPEN_TAG_PATTERN, LEADING_ASSISTANT_FILLER_PATTERN):
        body = pattern.sub("", body)
    return body.strip()