Spaces:
Running
Running
| """ | |
| Textual Token Clue Quantization | |
| Converts linguistic hints into numeric token ranges | |
| """ | |
import re
from typing import Dict, List, Optional, Tuple

from registry.schema import InferenceConstraint, EvidenceType
def quantize_textual_hint(text: str, context: Optional[Dict] = None) -> Optional[InferenceConstraint]:
    """
    Map a vague textual statement about token counts to a numeric range.

    The text is lower-cased and tested against an ordered list of phrase
    rules; the first rule that matches and resolves to a (min, max) tuple
    produces the result.

    Args:
        text: Text that may contain a token-count hint.
        context: Optional context; it is only forwarded to
            _infer_from_reference for the "orders of magnitude more than X"
            phrase.

    Returns:
        InferenceConstraint holding the token range (method
        "textual_quantization", confidence 0.4), or None if no rule matches.
    """
    def fixed(low, high):
        # Resolver for phrases that carry no number of their own.
        return lambda _found: (low, high)

    def scaled(low_factor, high_factor):
        # Resolver for phrases whose captured number is expressed in trillions.
        def resolve(found):
            amount = float(found.group(1))
            return (amount * low_factor * 1e12, amount * high_factor * 1e12)
        return resolve

    def relative_to_model(found):
        # The captured group is the name of the model being compared against.
        return _infer_from_reference(found.group(1), context)

    # Order matters: rules are tried top to bottom and the first hit wins.
    rules = [
        # "several trillion tokens"
        (r'several\s+trillion\s+tokens?', fixed(1e12, 10e12)),
        # "trillions of tokens"
        (r'trillions?\s+of\s+tokens?', fixed(2e12, 20e12)),
        # "over X trillion tokens" -> between X and 2X
        (r'over\s+(\d+\.?\d*)\s*trillion\s+tokens?', scaled(1, 2)),
        # "approximately X trillion" -> X +/- 20%
        (r'approximately\s+(\d+\.?\d*)\s*trillion', scaled(0.8, 1.2)),
        # "orders of magnitude more than X"
        (r'orders?\s+of\s+magnitude\s+more\s+than\s+([a-z0-9\-]+)', relative_to_model),
        # "trained on XT tokens" -> X +/- 10%
        (r'trained\s+on\s+(\d+\.?\d*)\s*t\s*tokens?', scaled(0.9, 1.1)),
    ]

    lowered = text.lower()
    for regex, resolve in rules:
        found = re.search(regex, lowered)
        if found is None:
            continue

        token_range = resolve(found)
        if not isinstance(token_range, tuple):
            # A resolver that yields no usable range lets later rules try.
            continue

        lower_bound, upper_bound = token_range
        return InferenceConstraint(
            method="textual_quantization",
            tokens_min=lower_bound,
            tokens_max=upper_bound,
            evidence_type=EvidenceType.E5.value,
            confidence=0.4,  # Lower confidence for textual hints
            notes=f"Extracted from: '{found.group(0)}'"
        )

    return None
def _infer_from_reference(model_name: str, context: Optional[Dict]) -> Tuple[float, float]:
    """
    Estimate a token range for a model described relative to another model.

    Args:
        model_name: Name of the reference model; matched case-insensitively
            against a small built-in table.
        context: Accepted for interface compatibility; not consulted here.

    Returns:
        (min, max) token counts: 10x to 100x the reference model's count when
        the model is in the table, otherwise a default of (1e12, 10e12).
    """
    # Token counts (in tokens) for the models this helper recognises.
    reference_counts = {
        "llama-2": 2e12,
        "llama-3": 15e12,
        "gpt-3": 300e9,
        "gpt-4": 13e12,
        "claude-3": 3.4e12,
    }

    reference = reference_counts.get(model_name.lower())
    if not reference:
        # Unknown reference model: fall back to a wide default range.
        return (1e12, 10e12)

    # "orders of magnitude" is read as 10-100x the reference count.
    lower_bound = reference * 10
    upper_bound = reference * 100
    return (lower_bound, upper_bound)
def extract_all_textual_hints(text: str) -> List[InferenceConstraint]:
    """
    Collect the token hints found in a document, one sentence at a time.

    The document is split wherever '.', '!' or '?' is followed by whitespace,
    and each piece is passed to quantize_textual_hint (at most one constraint
    per piece).

    Args:
        text: Full document text.

    Returns:
        List of InferenceConstraints in document order; empty if none found.
    """
    pieces = re.split(r'[.!?]\s+', text)
    candidates = (quantize_textual_hint(piece) for piece in pieces)
    # Drop pieces for which no hint was recognised.
    return [found for found in candidates if found]