"""
Textual Token Clue Quantization
Converts linguistic hints into numeric token ranges
"""

import re
from typing import Dict, List, Optional, Tuple

from registry.schema import InferenceConstraint, EvidenceType


def quantize_textual_hint(text: str, context: Optional[Dict] = None) -> Optional[InferenceConstraint]:
    """
    Convert a textual hint about training-data size into a token range

    The lower-cased text is tested against a fixed, ordered list of phrase
    patterns and the first pattern that matches determines the result.
    Patterns carrying an explicit number are tried before vague phrases, so
    a sentence containing both yields the tighter, number-based range.

    Args:
        text: Text containing token hints. Empty or None text yields None.
        context: Optional context; forwarded unchanged to
            _infer_from_reference for the "orders of magnitude" pattern.

    Returns:
        InferenceConstraint with token range, or None when no pattern matches
    """
    if not text:
        return None

    text_lower = text.lower()

    def _scaled_range(low_factor, high_factor):
        """Build a handler turning a captured count (in trillions) into a range"""
        def handler(m):
            count = float(m.group(1))
            return (count * low_factor * 1e12, count * high_factor * 1e12)
        return handler

    # Each entry is (regex, handler). A handler is either a fixed
    # (min, max) tuple or a callable taking the match and returning one.
    patterns = [
        # --- explicit numbers first: they are more precise than vague phrases ---
        # "trained on X T tokens" / "trained on X trillion tokens" (+/-10%)
        (r'trained\s+on\s+(\d+\.?\d*)\s*t(?:rillion)?\s*tokens?', _scaled_range(0.9, 1.1)),
        # "approximately X trillion" (+/-20%)
        (r'approximately\s+(\d+\.?\d*)\s*trillion', _scaled_range(0.8, 1.2)),
        # "over X trillion tokens" (X to 2X)
        (r'over\s+(\d+\.?\d*)\s*trillion\s+tokens?', _scaled_range(1, 2)),
        # --- vague phrases ---
        # "several trillion tokens"
        (r'several\s+trillion\s+tokens?', (1e12, 10e12)),
        # "trillions of tokens"
        (r'trillions?\s+of\s+tokens?', (2e12, 20e12)),
        # "orders of magnitude more than X"
        (r'orders?\s+of\s+magnitude\s+more\s+than\s+([a-z0-9\-]+)',
         lambda m: _infer_from_reference(m.group(1), context)),
    ]

    for pattern, handler in patterns:
        match = re.search(pattern, text_lower)
        if not match:
            continue

        result = handler(match) if callable(handler) else handler
        if not isinstance(result, tuple):
            # Handler could not produce a range; fall through to later patterns
            continue
        tokens_min, tokens_max = result

        return InferenceConstraint(
            method="textual_quantization",
            tokens_min=tokens_min,
            tokens_max=tokens_max,
            evidence_type=EvidenceType.E5.value,
            confidence=0.4,  # Lower confidence for textual hints
            notes=f"Extracted from: '{match.group(0)}'"
        )

    return None


def _infer_from_reference(model_name: str, context: Optional[Dict]) -> Tuple[float, float]:
    """Infer tokens based on reference to another model"""
    # Common model token counts (in tokens)
    known_tokens = {
        "llama-2": 2e12,
        "llama-3": 15e12,
        "gpt-3": 300e9,
        "gpt-4": 13e12,
        "claude-3": 3.4e12,
    }
    
    model_lower = model_name.lower()
    base_tokens = known_tokens.get(model_lower)
    
    if base_tokens:
        # "orders of magnitude" = 10-100x
        return (base_tokens * 10, base_tokens * 100)
    
    return (1e12, 10e12)  # Default wide range


def extract_all_textual_hints(text: str) -> List[InferenceConstraint]:
    """
    Collect token hints from a document, sentence by sentence

    The text is split wherever '.', '!' or '?' is followed by whitespace,
    and each resulting piece is passed to quantize_textual_hint. At most one
    constraint is produced per piece.

    Args:
        text: Full document text

    Returns:
        List of InferenceConstraints, in document order; pieces that yield
        no (truthy) constraint are skipped
    """
    # Split into sentences on terminal punctuation followed by whitespace
    pieces = re.split(r'[.!?]\s+', text)

    return [
        found
        for found in map(quantize_textual_hint, pieces)
        if found
    ]