Spaces:

TAGOOZ
/

reasoning-trajectory-demo

Sleeping

File size: 41,454 Bytes
"""Utility functions"""

from __future__ import annotations
from typing import Any, Dict, List, Optional
from pathlib import Path
import json
import random
import numpy as np


def set_seed(seed: int) -> None:
    """Set random seed for reproducibility

    Args:
        seed: Random seed
    """
    random.seed(seed)
    np.random.seed(seed)

    try:
        import torch
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
    except ImportError:
        pass


def save_json(data: Any, filepath: str, indent: int = 2) -> None:
    """Save data to JSON file

    Args:
        data: Data to save
        filepath: Output file path
        indent: JSON indentation
    """
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=indent, ensure_ascii=False)


def load_json(filepath: str) -> Any:
    """Load data from JSON file

    Args:
        filepath: Input file path

    Returns:
        Loaded data
    """
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)


def save_jsonl(data: List[Dict[str, Any]], filepath: str) -> None:
    """Save data to JSONL file

    Args:
        data: List of dictionaries to save
        filepath: Output file path
    """
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        for item in data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


def load_jsonl(filepath: str) -> List[Dict[str, Any]]:
    """Load data from JSONL file

    Args:
        filepath: Input file path

    Returns:
        List of dictionaries
    """
    data = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))
    return data


def last_boxed_only_string(text: str) -> Optional[str]:
    """Extract the last \\boxed{} expression from text (Hendrycks MATH standard)

    This is the canonical extractor used in:
    - Original MATH dataset (Hendrycks et al.)
    - lm-evaluation-harness
    - Minerva
    - Math-Verify

    Args:
        text: Text containing LaTeX \\boxed{} expressions

    Returns:
        Content of the last \\boxed{} expression, or None if not found
    """
    import re

    # Pattern for \boxed{} or \fbox{}
    # Use a simple pattern first for non-nested cases
    simple_pattern = r'\\(?:boxed|fbox)\{([^{}]+)\}'
    matches = list(re.finditer(simple_pattern, text))

    if matches:
        # Return content of last match
        return matches[-1].group(1).strip()

    # If no simple match, try to handle nested braces with brace counting
    # This handles cases like \boxed{\frac{a}{b}}
    boxed_starts = list(re.finditer(r'\\(?:boxed|fbox)\{', text))

    if not boxed_starts:
        return None

    # Process from the last \boxed{ occurrence
    last_start = boxed_starts[-1]
    start_pos = last_start.end()

    # Count braces to find matching closing brace
    depth = 1
    end_pos = start_pos

    for i in range(start_pos, len(text)):
        if text[i] == '{':
            depth += 1
        elif text[i] == '}':
            depth -= 1
            if depth == 0:
                end_pos = i
                break

    if depth == 0:
        content = text[start_pos:end_pos].strip()
        return content if content else None

    return None


def get_gold_answer_math(
    gold_answer_field: Optional[str] = None,
    gold_solution_text: Optional[str] = None,
) -> Optional[str]:
    """Get gold answer for MATH dataset in a dataset-agnostic way

    Args:
        gold_answer_field: Direct answer field (MATH-500 HuggingFace format)
        gold_solution_text: Solution text with \\boxed{} answer (original MATH format)

    Returns:
        Extracted gold answer string
    """
    # MATH-500 HuggingFace format: use answer field directly
    if gold_answer_field is not None:
        return gold_answer_field.strip()

    # Original Hendrycks MATH format: extract from solution using \\boxed{}
    if gold_solution_text is not None:
        ans = last_boxed_only_string(gold_solution_text)
        if ans:
            return ans

    return None


def extract_answer_before_hash_r1(text: str, task: str = "gsm8k") -> Optional[str]:
    """Extract answer that appears BEFORE #### marker (R1-specific pattern)

    R1 models often put the final answer right before ####, then regenerate after.
    Examples:
    - "Total time = 16 hours.\n####" → "16"
    - "The answer is \\boxed{72}.\n####" → "72"

    Args:
        text: Generated text containing #### marker
        task: Task type (gsm8k or math/math-500)

    Returns:
        Extracted answer from before ####, or None
    """
    import re

    if '####' not in text:
        return None

    # Get text before the last ####
    last_hash_idx = text.rfind('####')
    before_hash = text[:last_hash_idx].strip()

    if not before_hash:
        return None

    if task in ("math", "math-500"):
        # MATH: Look for \boxed{} before ####
        boxed_answer = last_boxed_only_string(before_hash)
        if boxed_answer:
            return boxed_answer

        # Fallback: extract from last line before ####
        lines = before_hash.split('\n')
        for line in reversed(lines[-3:]):  # Check last 3 lines
            line = line.strip()
            if line and not line.startswith('Step'):
                # Try to extract LaTeX expression or number
                # Remove common prefixes
                for prefix in ['So,', 'Therefore,', 'Thus,', 'Hence,', 'The answer is', 'Final answer:']:
                    if line.startswith(prefix):
                        line = line[len(prefix):].strip()
                        break

                # Clean and return if reasonable length
                line = line.rstrip('.')
                if line and len(line) < 100:
                    return line

        return None

    else:  # GSM8K
        # Look for the first number before ####, then extend backwards
        # Get last 200 chars before #### to avoid scanning entire text
        snippet = before_hash[-200:] if len(before_hash) > 200 else before_hash

        # Find all numbers in the snippet
        # Pattern: optional $, then number with optional commas/spaces
        number_pattern = r'\$?\s*(-?\d+(?:[,\s]\d+)*(?:\.\d+)?)'
        matches = list(re.finditer(number_pattern, snippet))

        if not matches:
            return None

        # Get the last number before ####
        last_match = matches[-1]
        number_str = last_match.group(1).replace(',', '').replace(' ', '')

        # Now look backwards from this number to find the start of the answer phrase
        # This helps extract "16 hours" instead of just "16"
        match_start = last_match.start()
        phrase_start = match_start

        # Look back up to 50 chars or until we hit punctuation/newline
        lookback_start = max(0, match_start - 50)
        for i in range(match_start - 1, lookback_start - 1, -1):
            char = snippet[i]
            if char in '.!?\n':
                phrase_start = i + 1
                break
            elif char in '=':
                # Include the part after = (e.g., "= 16")
                phrase_start = i
                break

        # Extract the phrase containing the number
        phrase = snippet[phrase_start:last_match.end()].strip()

        # Clean up the phrase
        phrase = phrase.lstrip('=').strip()
        phrase = phrase.lstrip('$').strip()

        # For GSM8K, we typically want just the number
        # But first try to extract it cleanly from the phrase
        clean_number = re.search(r'(-?\d+(?:\.\d+)?)', phrase)
        if clean_number:
            return clean_number.group(1)

        return number_str


def extract_answer(
    text: str,
    task: str = "gsm8k",
    use_r1_fallback: bool = False,
    gold_answer: Optional[str] = None
) -> Optional[str]:
    """Extract final answer from generated text (ROBUST version with MATH-specific improvements)

    Handles:
    - GSM8K format: "#### 36 + 3 = 39" → "39"
    - MATH format: "\\boxed{42}" or "\\boxed{x \\in [-2,7]}" → "42" or "x \\in [-2,7]"
    - MATH with ####: "#### $answer$" or "#### answer"
    - R1-specific: Answer before #### when use_r1_fallback=True
    - Comma-separated numbers: "#### 1,234" → "1234"
    - Currency symbols: "#### $70" → "70"
    - Negative numbers: "#### -42" → "-42"
    - Spaces between digits: "#### 118 000" → "118000"
    - Text after number: "#### 70 dollars" → "70"
    - Recursive operators: "#### 36 + 3 = 39 + 1 = 40" → "40"
    - Repetition detection: Stops before repetitive patterns
    - Fallback: Extracts from last coherent step

    Args:
        text: Generated text
        task: Task/dataset name (gsm8k, math-500, math, etc.)
        use_r1_fallback: If True, try extracting answer before #### when after-#### extraction
                        fails or is incorrect (for R1 models)
        gold_answer: Optional gold answer for R1 fallback validation

    Returns:
        Extracted answer or None
    """
    if not text:
        return None

    import re

    # Validate and normalize task name
    task = task.lower().strip()
    if task not in ("gsm8k", "math", "math-500"):
        # Treat unknown tasks as generic numeric extraction
        task = "gsm8k"

    # Helper function: Detect repetitive pattern and truncate
    def remove_repetition(txt: str) -> str:
        """Remove repetitive patterns from end of text"""
        if len(txt) < 100:
            return txt

        # Check for exact repetition of substrings at the end
        for pattern_len in [20, 30, 40, 50]:
            if len(txt) < pattern_len * 3:
                continue
            pattern = txt[-pattern_len:]
            # Count how many times this pattern repeats at the end
            count = 1
            pos = len(txt) - pattern_len * 2
            while pos >= 0 and txt[pos:pos+pattern_len] == pattern:
                count += 1
                pos -= pattern_len

            if count >= 3:  # If pattern repeats 3+ times, truncate
                return txt[:pos + pattern_len]

        # Check for "Step N: ####" repetition (common failure mode)
        if txt.count('Step ') > 10:
            # Find where "Step N: ####" pattern starts repeating
            matches = list(re.finditer(r'Step \d+: ####', txt))
            if len(matches) >= 5:  # If we see this pattern 5+ times
                # Truncate at the first occurrence of this pattern
                return txt[:matches[0].start()]

        return txt

    # Helper function: Extract last number from text as fallback
    def extract_last_number(txt: str) -> Optional[str]:
        all_numbers = re.findall(r'(-?\d+(?:[\s,]\d+)*(?:\.\d+)?)', txt)
        if all_numbers:
            return all_numbers[-1].replace(' ', '').replace(',', '')
        return None

    # Helper function: Extract answer from #### marker (MATH-aware)
    def extract_from_hash_marker(txt: str) -> Optional[str]:
        """Extract answer after #### marker, handling MATH-specific formats"""
        if '####' not in txt:
            return None

        # Split by #### and get the LAST occurrence (most recent answer)
        parts = txt.split('####')
        answer_part = parts[-1].strip()
        before_hash = parts[-2] if len(parts) >= 2 else ""

        # Check if answer_part is actually useful
        # It's not useful if: empty, too long, starts with "Step", or is a full sentence without clear answer
        answer_looks_like_sentence = (
            answer_part and
            len(answer_part.split()) > 5 and  # More than 5 words
            not answer_part[0].isdigit() and  # Doesn't start with a number
            not answer_part.startswith('$') and  # Doesn't start with LaTeX
            ('the' in answer_part.lower() or 'is' in answer_part.lower())  # Contains common sentence words
        )

        if not answer_part or len(answer_part) > 200 or answer_part.startswith('Step') or answer_looks_like_sentence:
            # If nothing useful after ####, or it's too long (likely paragraph), or starts with Step, or looks like a sentence
            # Try to extract from the text BEFORE #### (look for "Final Answer:" or similar)
            if before_hash:
                # Look for common answer indicators in the last 300 chars before ####
                before_snippet = before_hash[-300:] if len(before_hash) > 300 else before_hash

                # Try to find "Final Answer:", "Answer:", etc.
                answer_patterns = [
                    r'Final Answer:\s*([^\n]+)',
                    r'Answer:\s*([^\n]+)',
                    r'Therefore,?\s+(?:the answer is|we get|we have)\s*:?\s*([^\n]+)',
                    r'(?:Thus|Hence|So),?\s+(?:the answer is|we get|we have)\s*:?\s*([^\n]+)',
                ]

                for pattern in answer_patterns:
                    match = re.search(pattern, before_snippet, re.IGNORECASE)
                    if match:
                        potential_answer = match.group(1).strip()
                        # Clean up the extracted answer
                        potential_answer = potential_answer.rstrip('.,;:')
                        # Extract just the number/value if it's a sentence
                        words = potential_answer.split()
                        if len(words) > 3:  # If it's a phrase, extract the first number/value
                            # Look for a number at the start
                            number_match = re.match(r'^(-?\d+(?:\.\d+)?)', potential_answer)
                            if number_match:
                                return number_match.group(1)
                        return potential_answer

        # Continue with normal after-#### processing
        if not answer_part:
            # If last #### has nothing after it, try second-to-last
            if len(parts) >= 2:
                answer_part = parts[-2].strip()
            if not answer_part:
                return None

        # Skip if it starts with "Step" (incomplete generation)
        if answer_part.startswith('Step'):
            # Try to find a #### that's NOT followed by Step
            for i in range(len(parts) - 1, 0, -1):
                candidate = parts[i].strip()
                if candidate and not candidate.startswith('Step'):
                    answer_part = candidate
                    break
            else:
                return None

        # Format 1: #### $latex_expression$
        # Extract content between first $ and last $ on same line
        first_line = answer_part.split('\n')[0].strip()
        if first_line.startswith('$') and first_line.count('$') >= 2:
            # Extract content between first and last $
            dollar_content = first_line[1:]  # Remove first $
            if '$' in dollar_content:
                dollar_content = dollar_content[:dollar_content.rfind('$')]  # Remove last $
                dollar_content = dollar_content.strip()

                # If it's an equation like "f(x)=5", extract the value after =
                if '=' in dollar_content:
                    # Split by = and get the rightmost part
                    eq_parts = dollar_content.split('=')
                    # Get last non-empty part
                    for part in reversed(eq_parts):
                        part = part.strip()
                        if part:
                            return part

                return dollar_content

        # Format 2: #### plain answer (number or expression)
        # Take first line only, clean it
        first_line = answer_part.split('\n')[0].strip()

        # Remove trailing/leading $ if present
        first_line = first_line.strip('$')

        # Remove markdown formatting (**, *, etc.)
        first_line = re.sub(r'\*\*([^*]+)\*\*', r'\1', first_line)  # **text** → text
        first_line = re.sub(r'\*([^*]+)\*', r'\1', first_line)      # *text* → text
        first_line = first_line.strip()

        # Remove common prefixes that indicate an answer
        prefixes_to_remove = [
            'The final answer is',
            'The answer is',
            'Final Answer:',
            'Final Answer',
            'Answer:',
            'Answer',
            'Therefore,',
            'Thus,',
            'So,',
            'Hence,',
        ]

        for prefix in prefixes_to_remove:
            if first_line.startswith(prefix):
                first_line = first_line[len(prefix):].strip()
                # Remove trailing punctuation and colons after removing prefix
                first_line = first_line.lstrip(':').strip().rstrip('.,;:')
                break

        # Remove common suffixes like explanations
        first_line = re.split(r'\n|Explanation:|Note:|Solution:', first_line)[0].strip()

        # If what remains is a sentence (contains many words) OR has text after a number, extract just the number
        # This handles cases like "3 treeks", "70 dollars", "The combined weight of three treeks equals the weight of one squig."
        words = first_line.split()
        if len(words) >= 2:  # Has multiple words (number + text), try to extract just the value
            # Look for numbers in the text
            numbers = re.findall(r'-?\d+(?:\.\d+)?', first_line)
            if numbers:
                # If there's only one number, return it
                if len(numbers) == 1:
                    return numbers[0]
                # If multiple numbers, try to find the most likely answer (usually the last meaningful one)
                return numbers[-1]
            # If no numbers, try to extract key mathematical value
            # Look for pattern like "X equals Y" or "X = Y" or "has X units"
            if '=' in first_line or 'equals' in first_line.lower() or 'has' in first_line.lower():
                split_parts = re.split(r'=|equals|has', first_line, flags=re.IGNORECASE)
                if len(split_parts) >= 2:
                    last_part = split_parts[-1].strip().rstrip('.,;:')
                    # Try to extract number or short expression
                    cleaned = re.sub(r'\b(the|a|an|one|of|to|is|are)\b', '', last_part, flags=re.IGNORECASE).strip()
                    # Extract first number from cleaned part
                    num_match = re.search(r'-?\d+(?:\.\d+)?', cleaned)
                    if num_match:
                        return num_match.group(0)
                    if cleaned and len(cleaned) < 20:
                        return cleaned

        return first_line if first_line else None

    # Helper function: Extract from last coherent step
    def extract_from_last_step(txt: str) -> Optional[str]:
        """Extract answer from the last coherent step before repetition"""
        # Find all Step N: patterns
        step_matches = list(re.finditer(r'Step \d+:', txt))
        if not step_matches:
            return None

        # Get the last few steps
        for match in reversed(step_matches[-5:]):  # Check last 5 steps
            start = match.end()
            # Find end of this step (next Step or end of text)
            next_match_idx = step_matches.index(match) + 1
            if next_match_idx < len(step_matches):
                end = step_matches[next_match_idx].start()
            else:
                end = len(txt)

            step_content = txt[start:end].strip()

            # Look for equations or expressions ending with $ on a line by itself
            lines = step_content.split('\n')
            for line in reversed(lines):
                line = line.strip()
                # Check if line contains LaTeX expression
                if '$' in line and not line.startswith('Step'):
                    # Extract from $...$
                    if line.count('$') >= 2:
                        dollar_parts = line.split('$')
                        for part in reversed(dollar_parts):
                            part = part.strip()
                            if part and not part.startswith('Step'):
                                # Check if this looks like an answer (not a full sentence)
                                if len(part) < 50 and ('=' in part or '\\' in part or part.replace('.', '').replace('-', '').replace(',', '').replace(' ', '').replace('/', '').isalnum()):
                                    # Extract the value after = if present
                                    if '=' in part:
                                        after_eq = part.split('=')[-1].strip()
                                        if after_eq:
                                            return after_eq
                                    return part

        return None

    if task in ("math-500", "math"):
        # MATH format: For gold answers, the text IS the answer (may contain LaTeX)
        # For generated text, use multi-strategy extraction

        # Remove repetitive patterns first
        text = remove_repetition(text)

        # Strategy 1: Look for \boxed{} notation (official MATH format, highest priority)
        # Use the canonical last_boxed_only_string extractor
        boxed_answer = last_boxed_only_string(text)
        if boxed_answer:
            result = boxed_answer
        # Note: If boxed_answer is None, this is a formatting failure for MATH.
        # Upstream code may want to track n_with_boxed vs n_without_boxed
        # to distinguish formatting quality from reasoning quality.

        # Strategy 2: Look for #### marker (GSM8K-style, but model sometimes uses it)
        elif '####' in text:
            hash_answer = extract_from_hash_marker(text)
            if hash_answer:
                result = hash_answer
            else:
                result = None
        # Strategy 3: If text looks like a direct answer (gold answer case)
        # This handles gold answers like "\frac{14}{3}", "x \\in [-2,7]", or "p - q"
        elif not ("Step" in text or len(text) > 200):
            # Likely a gold answer - return as-is after light cleanup
            result = text.strip()
        else:
            # Strategy 4: Extract from last coherent step
            step_answer = extract_from_last_step(text)
            if step_answer:
                result = step_answer
            else:
                # Strategy 5: For MATH, do NOT default to last numeric token
                result = None

        # R1 FALLBACK for MATH: Check if we should try before-hash extraction
        if use_r1_fallback and '####' in text:
            should_try_r1 = False
            if result is None:
                should_try_r1 = True
            elif gold_answer is not None and not evaluate_math_answer(result, gold_answer):
                should_try_r1 = True

            if should_try_r1:
                r1_answer = extract_answer_before_hash_r1(text, task)
                if r1_answer and (gold_answer is None or evaluate_math_answer(r1_answer, gold_answer)):
                    return r1_answer

        return result

    if task == "gsm8k":
        # GSM8K format: answer is after ####
        # Use the shared extract_from_hash_marker helper for consistency
        from_hash = extract_from_hash_marker(text)

        if from_hash is None:
            # No #### found or extraction failed, try last number
            result = extract_last_number(text)
        else:
            # For GSM8K, enforce "single number" semantics
            # Normalize to extract just the numeric value
            num = normalize_answer(from_hash)
            result = num if num != "" else extract_last_number(text)

        # R1 FALLBACK for GSM8K: Check if we should try before-hash extraction
        if use_r1_fallback and '####' in text:
            should_try_r1 = False
            if result is None:
                should_try_r1 = True
            elif gold_answer is not None and not evaluate_answer(result, gold_answer):
                should_try_r1 = True

            if should_try_r1:
                r1_answer = extract_answer_before_hash_r1(text, task)
                if r1_answer and (gold_answer is None or evaluate_answer(r1_answer, gold_answer)):
                    return r1_answer

        return result

    # Generic extraction: try to find last number in entire text
    return extract_last_number(text)


def normalize_answer(answer: str) -> str:
    """Normalize answer for comparison

    Removes spaces, commas, currency symbols and converts to standard number format.
    Extracts first number if answer contains non-numeric text.

    Args:
        answer: Answer string

    Returns:
        Normalized answer string
    """
    if not answer:
        return ""

    import re

    # First try to extract just the number (handles "70 dollars", etc.)
    # Remove leading currency symbols
    answer = re.sub(r'^[\$£€¥₹]+\s*', '', answer.strip())

    # Extract first number (with optional decimal point and commas)
    number_match = re.match(r'(-?[\d,]+(?:\.\d+)?)', answer)
    if number_match:
        answer = number_match.group(1)

    # Remove remaining spaces and commas
    answer = re.sub(r'[\s,]', '', answer)

    # Try to convert to number and normalize
    try:
        num = float(answer)
        if num.is_integer():
            return str(int(num))
        else:
            return str(num)
    except (ValueError, TypeError):
        return answer.strip()


def answers_match(answer1: str, answer2: str) -> bool:
    """Check if two answers match after normalization

    Args:
        answer1: First answer
        answer2: Second answer

    Returns:
        True if answers match after normalization
    """
    return normalize_answer(answer1) == normalize_answer(answer2)


def evaluate_answer(
    predicted: str,
    ground_truth: str,
    tolerance: float = 1e-5,
) -> bool:
    """Evaluate if predicted answer matches ground truth (robust version)

    Uses normalization to handle:
    - Different spacing: "1 000" vs "1000"
    - Commas: "1,000" vs "1000"
    - Currency: "$50" vs "50"
    - Trailing text: "72 clips" vs "72"
    - Decimal vs integer: "72.0" vs "72"

    Args:
        predicted: Predicted answer
        ground_truth: Ground truth answer
        tolerance: Numerical tolerance (not used with normalization, kept for API compatibility)

    Returns:
        True if answers match
    """
    if not predicted or not ground_truth:
        return False

    # Use robust normalization and matching
    return answers_match(predicted, ground_truth)


def math_normalize_answer(answer: str) -> str:
    """Normalize MATH dataset answer for comparison

    Handles:
    - LaTeX expressions: \\frac{a}{b}, \\sqrt{x}, etc.
    - Intervals: [a,b], (a,b), x \\in [a,b]
    - Sets: \\{a, b, c\\}
    - Matrices and vectors
    - Numerical values with various formats
    - Whitespace normalization

    Args:
        answer: Answer string (may contain LaTeX)

    Returns:
        Normalized answer string
    """
    if not answer:
        return ""

    import re

    # Strip outer whitespace
    answer = answer.strip()

    # Remove LaTeX display mode markers
    answer = answer.replace('$', '')
    answer = answer.replace('\\[', '').replace('\\]', '')
    answer = answer.replace('\\(', '').replace('\\)', '')

    # Normalize whitespace around common delimiters
    answer = re.sub(r'\s*,\s*', ',', answer)  # "a, b" → "a,b"
    answer = re.sub(r'\s*=\s*', '=', answer)  # "x = 5" → "x=5"
    answer = re.sub(r'\s+', ' ', answer)      # Multiple spaces → single space

    # Normalize LaTeX fractions: \frac{a}{b} → frac(a,b) for comparison
    # This allows \frac{1}{2} to match \frac{2}{4} after simplification
    def normalize_frac(match):
        num = match.group(1).strip()
        den = match.group(2).strip()
        return f"frac({num},{den})"
    answer = re.sub(r'\\frac\{([^}]+)\}\{([^}]+)\}', normalize_frac, answer)

    # Normalize common LaTeX commands (preserve but standardize)
    latex_commands = {
        '\\sqrt': 'sqrt',
        '\\pi': 'pi',
        '\\infty': 'inf',
        '\\in': ' in ',
        '\\cup': ' cup ',
        '\\cap': ' cap ',
        '\\subset': ' subset ',
        '\\subseteq': ' subseteq ',
        '\\emptyset': 'emptyset',
        '\\varnothing': 'emptyset',
        '\\left': '',
        '\\right': '',
        # Note: \\{ and \\} handled separately below for clarity
        '\\ldots': '...',
        '\\cdots': '...',
        '\\le': '<=',
        '\\ge': '>=',
        '\\leq': '<=',
        '\\geq': '>=',
        '\\ne': '!=',
        '\\neq': '!=',
        '\\times': '*',
        '\\cdot': '*',
        '\\div': '/',
    }

    for latex_cmd, replacement in latex_commands.items():
        answer = answer.replace(latex_cmd, replacement)

    # Handle escaped braces separately (avoid duplicate with dict)
    answer = answer.replace('\\{', '{')
    answer = answer.replace('\\}', '}')

    # Normalize interval notation: remove extra spaces inside brackets
    answer = re.sub(r'\[\s*', '[', answer)
    answer = re.sub(r'\s*\]', ']', answer)
    answer = re.sub(r'\(\s*', '(', answer)
    answer = re.sub(r'\s*\)', ')', answer)

    # Try to simplify fractions if both numerator and denominator are integers
    def simplify_frac(match):
        import math
        try:
            num = int(match.group(1))
            den = int(match.group(2))
            gcd = math.gcd(abs(num), abs(den))
            return f"frac({num//gcd},{den//gcd})"
        except:
            return match.group(0)
    answer = re.sub(r'frac\((-?\d+),(-?\d+)\)', simplify_frac, answer)

    # Normalize numbers: remove commas, standardize decimals
    def normalize_number(match):
        num_str = match.group(0).replace(',', '')
        try:
            num = float(num_str)
            if num.is_integer():
                return str(int(num))
            else:
                # Round to 6 decimal places to avoid floating point issues
                return f"{num:.6f}".rstrip('0').rstrip('.')
        except:
            return match.group(0)

    answer = re.sub(r'-?\d+(?:,\d{3})*(?:\.\d+)?', normalize_number, answer)

    # Final cleanup
    answer = answer.strip()

    return answer


def canonical_non_numeric(ans: str) -> Optional[str]:
    """Map various verbal answers to a canonical label

    Handles non-numeric MATH answers like:
    - "no solution"
    - "does not exist"
    - "infinitely many solutions"
    - "all real numbers"

    Args:
        ans: Answer string

    Returns:
        Canonical label if recognized, None otherwise
    """
    import re

    a = ans.strip().lower()
    # Strip punctuation
    a = re.sub(r'[.,;:!?\s]+', ' ', a).strip()

    # Common patterns
    if any(kw in a for kw in ["no solution", "no real solution", "no real solutions", "no solutions"]):
        return "NO_SOLUTION"

    if any(kw in a for kw in ["does not exist", "dne", "undefined"]):
        return "DNE"

    if any(kw in a for kw in ["infinitely many", "infinite number of solutions", "infinite solutions"]):
        return "INFINITELY_MANY"

    if any(kw in a for kw in ["all real numbers", "any real number", "for all real x", "all reals"]):
        return "ALL_REALS"

    return None


def math_answers_match(answer1: str, answer2: str) -> bool:
    """Check if two MATH dataset answers match after normalization

    Uses multiple strategies:
    1. Exact string match after normalization
    2. Numeric equality for pure numbers
    3. SymPy-based symbolic equivalence (algebraic, fraction, etc.)
    4. Whitespace-insensitive string comparison

    Args:
        answer1: First answer
        answer2: Second answer

    Returns:
        True if answers match after normalization
    """
    if not answer1 or not answer2:
        return False

    import re

    norm1 = math_normalize_answer(answer1)
    norm2 = math_normalize_answer(answer2)

    # 0. Non-numeric canonical forms (check before other strategies)
    # Handles "no solution", "DNE", "infinitely many", etc.
    canon1 = canonical_non_numeric(norm1)
    canon2 = canonical_non_numeric(norm2)
    if canon1 is not None or canon2 is not None:
        return canon1 == canon2

    # 1. Direct string match after normalization
    if norm1 == norm2:
        return True

    # 2. Try to evaluate as numbers if possible
    try:
        # Extract just numbers for comparison (handles cases like "4" vs "4.0")
        num1_match = re.search(r'^-?\d+(?:\.\d+)?$', norm1)
        num2_match = re.search(r'^-?\d+(?:\.\d+)?$', norm2)

        if num1_match and num2_match:
            num1 = float(norm1)
            num2 = float(norm2)
            return abs(num1 - num2) < 1e-6
    except:
        pass

    # 3. Try SymPy-based symbolic equivalence
    # This handles algebraic equivalence, fraction simplification, etc.
    try:
        import sympy as sp

        def try_sympy_parse(expr: str) -> Optional[sp.Expr]:
            """Attempt to parse expression as SymPy object"""
            try:
                expr_clean = expr

                # Convert frac(a,b) back to a/b
                expr_clean = re.sub(r'frac\(([^,]+),([^)]+)\)', r'(\1)/(\2)', expr_clean)

                # Basic cleanup: remove stray spaces around operators
                expr_clean = re.sub(r'\s+', ' ', expr_clean)

                # Locals mapping: let sympify know about sqrt, pi, etc.
                # Don't inject sp. into the string; pass locals dict instead
                locals_dict = {
                    "sqrt": sp.sqrt,
                    "pi": sp.pi,
                    "inf": sp.oo,
                    "e": sp.E,
                }

                return sp.sympify(expr_clean, locals=locals_dict, rational=True)
            except Exception:
                return None

        expr1 = try_sympy_parse(norm1)
        expr2 = try_sympy_parse(norm2)

        if expr1 is not None and expr2 is not None:
            try:
                # Check symbolic equality
                diff = sp.simplify(expr1 - expr2)
                if diff == 0:
                    return True

                # Also try expanding and simplifying both sides
                if sp.simplify(expr1) == sp.simplify(expr2):
                    return True

                # For fractions, check if they're equivalent
                if isinstance(expr1, sp.Rational) and isinstance(expr2, sp.Rational):
                    return expr1 == expr2

            except Exception:
                pass

    except ImportError:
        # SymPy not available, skip symbolic checking
        pass

    # 4. Check if they differ only in equivalent representations
    # E.g., "x in [-2,7]" vs "x in [ -2 , 7 ]"
    compact1 = re.sub(r'\s+', '', norm1).lower()
    compact2 = re.sub(r'\s+', '', norm2).lower()

    return compact1 == compact2


def evaluate_math_answer(
    predicted: str,
    ground_truth: str,
) -> bool:
    """Evaluate if predicted MATH answer matches ground truth

    Uses MATH-specific normalization that handles:
    - LaTeX expressions
    - Fractions and simplification
    - Intervals and sets
    - Multiple equivalent representations

    Args:
        predicted: Predicted answer
        ground_truth: Ground truth answer

    Returns:
        True if answers match
    """
    if not predicted or not ground_truth:
        return False

    return math_answers_match(predicted, ground_truth)


def format_prompt(
    question: str,
    template: str = "default",
    few_shot_examples: Optional[List[Dict[str, str]]] = None,
) -> str:
    """Format prompt for model input

    Args:
        question: Input question
        template: Prompt template to use
        few_shot_examples: Optional few-shot examples

    Returns:
        Formatted prompt
    """
    if template == "default":
        prompt = f"Question: {question}\n\nAnswer:"
    elif template == "cot":
        # Structured prompt with clear instructions for GSM8K format
        prompt = f"""You are a helpful assistant that solves problems step by step with each step signified by "Step [step_number]: ".
Always provide your final answer after #### at the end.

Question: {question}

Please solve this step by step, putting each step after "Step [step_number]: " and always provide your final answer after ####.

Solution:

"""
    elif template == "cot_final":
        # Chain of thought with "Final Answer:" marker
        prompt = f"""You are a helpful assistant that solves problems step by step with each step signified by "Step [step_number]: ".
Always provide your final answer after "Final Answer:" at the end.

Question: {question}

Please solve this step by step, putting each step after "Step [step_number]: " and always provide your final answer after "Final Answer:".

Solution:

"""
    elif template == "math_cot":
        # Chain of thought for MATH dataset with \boxed{} formatting
        prompt = f"""You are a helpful assistant that solves olympiad-style math problems step by step.
At the end, ALWAYS put your final answer in LaTeX form inside \\boxed{{...}} on its own line.

Question: {question}

Please solve this step by step, and put ONLY the final result (no explanation) inside \\boxed{{...}} on the last line.

Solution:

"""
    elif template == "direct":
        prompt = question
    else:
        prompt = question

    # Add few-shot examples if provided
    if few_shot_examples:
        examples_text = ""
        for ex in few_shot_examples:
            examples_text += f"Question: {ex['question']}\nAnswer: {ex['answer']}\n\n"
        prompt = examples_text + prompt

    return prompt


def calculate_metrics(
    predictions: List[Optional[str]],
    ground_truths: List[Optional[str]],
    task: str = "gsm8k",
) -> Dict[str, float]:
    """Calculate evaluation metrics with extraction tracking

    Distinguishes between extraction failures and reasoning errors:
    - accuracy: standard accuracy over all prompts (correct / total)
    - conditional_accuracy: accuracy given extraction succeeded (correct / extractable)
    - extraction_rate: fraction of prompts with extractable answers

    This allows distinguishing formatting quality from reasoning quality.

    Args:
        predictions: List of predicted answers (None if extraction failed)
        ground_truths: List of ground truth answers
        task: Task format ("gsm8k" or "math" / "math-500")

    Returns:
        Dictionary of metrics including accuracy, conditional_accuracy, extraction_rate
    """
    if len(predictions) != len(ground_truths):
        raise ValueError("Number of predictions and ground truths must match")

    total_prompts = len(predictions)
    valid_total = 0
    n_extractable = 0
    n_correct = 0

    for pred, gt in zip(predictions, ground_truths):
        # Skip if ground truth is None (bad gold answer)
        if gt is None:
            continue

        valid_total += 1

        # Track extraction success
        if pred is None:
            continue

        n_extractable += 1

        # Dispatch to appropriate evaluation function based on task
        if task in ("math", "math-500"):
            is_correct = evaluate_math_answer(pred, gt)
        else:
            is_correct = evaluate_answer(pred, gt)

        if is_correct:
            n_correct += 1

    overall_acc = n_correct / valid_total if valid_total > 0 else 0.0
    cond_acc = n_correct / n_extractable if n_extractable > 0 else 0.0
    extract_rate = n_extractable / valid_total if valid_total > 0 else 0.0

    return {
        "accuracy": overall_acc,                # correct / valid_total
        "conditional_accuracy": cond_acc,       # correct / extractable
        "extraction_rate": extract_rate,        # extractable / valid_total
        "correct": n_correct,
        "extractable": n_extractable,
        "total": valid_total,                   # valid examples (excludes None golds)
        "total_prompts": total_prompts,         # all prompts including invalid
    }


def print_metrics(metrics: Dict[str, Any], title: str = "Results") -> None:
    """Pretty print metrics

    Args:
        metrics: Dictionary of metrics
        title: Title for output
    """
    print(f"\n{'='*50}")
    print(f"{title:^50}")
    print(f"{'='*50}")

    for key, value in metrics.items():
        if isinstance(value, float):
            print(f"{key:.<30} {value:.4f}")
        else:
            print(f"{key:.<30} {value}")

    print(f"{'='*50}\n")


def create_directory_structure(base_dir: Optional[str] = None) -> None:
    """Create standard directory structure

    Args:
        base_dir: Base directory (defaults to current directory)
    """
    if base_dir is None:
        base_dir = Path.cwd()
    else:
        base_dir = Path(base_dir)

    directories = [
        "data",
        "data/gsm8k",
        "models",
        "models/cache",
        "output",
        "output/results",
        "output/trajectories",
        "output/checkpoints",
        "output/logs",
        "output/cache",
        "config",
        "src",
        "examples",
    ]

    for dir_name in directories:
        dir_path = base_dir / dir_name
        dir_path.mkdir(parents=True, exist_ok=True)

    print(f"Created directory structure at {base_dir}")