| """ |
| RAE Tokenizer Utilities |
| ═══════════════════════════════════════════════════════════════ |
| Phase-aware tokenization for RAE training data. |
| |
| Handles the special structure of RAE responses where XML-style |
| phase tags delineate cognitive phases. Ensures proper tokenization |
| of phase boundaries and provides utilities for phase-level analysis. |
| ═══════════════════════════════════════════════════════════════ |
| """ |
|
|
import re
from typing import Optional, Union
|
|
|
|
# Opening/closing tag pair for each RAE cognitive phase.
# Insertion order is the canonical phase order: saturation -> abstraction
# -> descent -> integration.
PHASE_TAGS = {
    "saturation": ("<SATURATION>", "</SATURATION>"),
    "abstraction": ("<ABSTRACTION>", "</ABSTRACTION>"),
    "descent": ("<DESCENT>", "</DESCENT>"),
    "integration": ("<INTEGRATION>", "</INTEGRATION>"),
}

# Flat list of every tag, laid out as [open, close, open, close, ...] in
# PHASE_TAGS order. Built with a comprehension so that no loop variables are
# left behind in the module namespace.
ALL_TAGS = [tag for tag_pair in PHASE_TAGS.values() for tag in tag_pair]
|
|
|
|
def add_rae_tokens(tokenizer):
    """
    Register every RAE phase tag with the tokenizer as a special token.

    The intent is that each phase boundary is encoded as one token instead
    of being broken into subword pieces, which keeps phase detection
    reliable when the loss is computed.

    Args:
        tokenizer: Object exposing ``add_special_tokens(dict)``; it is
            modified in place by that call.

    Returns:
        A ``(tokenizer, num_added)`` tuple, where ``num_added`` is whatever
        ``add_special_tokens`` returned (treated here as the number of
        tokens that were newly added).
    """
    # NOTE(review): when tokens are added, the model's embedding table
    # presumably has to be resized by the caller -- confirm at call sites.
    added_count = tokenizer.add_special_tokens(
        {"additional_special_tokens": ALL_TAGS}
    )

    # Stay silent when the tags were already known to the tokenizer.
    if added_count > 0:
        print(f" Added {added_count} RAE phase tokens to tokenizer")

    return tokenizer, added_count
|
|
|
|
def extract_phases(text: str) -> dict[str, str]:
    """
    Pull the content of each RAE phase out of tag-structured text.

    For every phase in ``PHASE_TAGS`` the first ``<TAG>...</TAG>`` span is
    located (non-greedy, newlines allowed inside) and its inner text is
    returned with surrounding whitespace stripped.

    Returns:
        A dict with one entry per phase name. A phase whose tag pair does
        not occur in ``text`` maps to the empty string.
    """

    def _phase_body(opening: str, closing: str) -> str:
        # Tags are escaped so they are always matched literally.
        found = re.compile(
            re.escape(opening) + r"(.*?)" + re.escape(closing),
            flags=re.DOTALL,
        ).search(text)
        if found is None:
            return ""
        return found.group(1).strip()

    return {name: _phase_body(*tag_pair) for name, tag_pair in PHASE_TAGS.items()}
|
|
|
|
def validate_rae_response(text: str) -> dict:
    """
    Validate that a response contains proper RAE structure.

    A phase counts as "found" only when its tag pair is present and the
    stripped content between the tags is non-empty; otherwise it is
    reported as missing. All lengths are whitespace-separated word counts.

    Returns a report with:
    - is_valid: True only when no phase is missing and no warning was raised
    - phases_found: list of phase names found
    - phases_missing: list of phase names missing
    - phase_lengths: word count of every phase (0 for missing phases)
    - compression_ratio: abstraction_len / saturation_len, or None when
      the saturation phase has no words
    - warnings: list of potential issues
    """
    phases = extract_phases(text)
    found = [name for name, content in phases.items() if content]
    missing = [name for name, content in phases.items() if not content]
    phase_lengths = {name: len(content.split()) for name, content in phases.items()}

    issues = []

    # Order check. `found` is always in canonical order because it is derived
    # from dict iteration, so comparing it with itself can never detect
    # anything. Instead, order the found phases by where their opening tag
    # actually appears in the text and compare that with the canonical order.
    expected_order = ["saturation", "abstraction", "descent", "integration"]
    canonical = [name for name in expected_order if name in found]
    textual = sorted(canonical, key=lambda name: text.find(PHASE_TAGS[name][0]))
    if textual != canonical:
        issues.append("Phases appear out of order")

    # Compression check: the abstraction is expected to be shorter than the
    # saturation it condenses.
    compression_ratio = None
    sat_len = phase_lengths.get("saturation", 0)
    abs_len = phase_lengths.get("abstraction", 0)
    if sat_len > 0:
        compression_ratio = abs_len / sat_len
        if compression_ratio > 1.0:
            issues.append(
                f"Abstraction is LONGER than Saturation (ratio={compression_ratio:.2f})"
            )

    # Length sanity checks apply only to phases that have content.
    for phase_name in found:
        word_count = phase_lengths[phase_name]
        if word_count < 10:
            issues.append(f"{phase_name} is very short ({word_count} words)")
        elif word_count > 1000:
            issues.append(f"{phase_name} is very long ({word_count} words)")

    return {
        "is_valid": not missing and not issues,
        "phases_found": found,
        "phases_missing": missing,
        "phase_lengths": phase_lengths,
        "compression_ratio": compression_ratio,
        "warnings": issues,
    }
|
|
|
|
def format_rae_chat(
    system_prompt: str,
    user_message: str,
    phases: dict[str, str],
    tokenizer=None,
) -> Union[str, list[dict[str, str]]]:
    """
    Format RAE phases into a chat-template-ready conversation.

    The assistant turn contains the four phases in canonical order
    (saturation, abstraction, descent, integration), each wrapped in its
    tag pair and separated by a blank line. A phase absent from ``phases``
    is emitted with empty content.

    Args:
        system_prompt: Content of the system message.
        user_message: Content of the user message.
        phases: Mapping of phase name to phase text.
        tokenizer: Optional object exposing ``apply_chat_template``.

    Returns:
        When ``tokenizer`` is given, the result of
        ``tokenizer.apply_chat_template(messages, tokenize=False,
        add_generation_prompt=False)`` (presumably the rendered string).
        Otherwise the raw list of ``{"role", "content"}`` message dicts.
    """
    sections = []
    for phase_name in ("saturation", "abstraction", "descent", "integration"):
        open_tag, close_tag = PHASE_TAGS[phase_name]
        content = phases.get(phase_name, "")
        sections.append(f"{open_tag}\n{content}\n{close_tag}")
    assistant_content = "\n\n".join(sections)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": assistant_content},
    ]

    # Compare against None explicitly: a tokenizer object may define __len__
    # and must not be skipped just because it evaluates as falsy.
    if tokenizer is not None:
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )

    return messages
|
|