odl-training-data / registry /inference /textual_quantization.py
midah's picture
AI Training Data Deals Dashboard with automated discovery, extraction pipeline, and MCP integration
0efb0d1
"""
Textual Token Clue Quantization
Converts linguistic hints into numeric token ranges
"""
import re
from typing import Dict, List, Optional, Tuple

from registry.schema import InferenceConstraint, EvidenceType
# Ordered table of (compiled pattern, handler) pairs; the first pattern that
# matches wins. A handler is either a fixed (min, max) token range or a
# callable taking (match, context) and returning such a tuple.
# Patterns are matched against lower-cased text, so they are written in
# lower case. Factors are applied before the 1e12 scale on purpose so the
# floating-point results are stable.
_HINT_PATTERNS = (
    # "several trillion tokens"
    (re.compile(r'several\s+trillion\s+tokens?'), (1e12, 10e12)),
    # "trillions of tokens"
    (re.compile(r'trillions?\s+of\s+tokens?'), (2e12, 20e12)),
    # "over X trillion tokens" -> [X, 2X] trillion. The leading \b keeps
    # words that merely end in "over" (e.g. "moreover") from being read as
    # a lower bound.
    (
        re.compile(r'\bover\s+(\d+\.?\d*)\s*trillion\s+tokens?'),
        lambda m, ctx: (
            float(m.group(1)) * 1e12,
            float(m.group(1)) * 2 * 1e12,
        ),
    ),
    # "approximately X trillion" -> X trillion +/- 20%
    (
        re.compile(r'approximately\s+(\d+\.?\d*)\s*trillion'),
        lambda m, ctx: (
            float(m.group(1)) * 0.8 * 1e12,
            float(m.group(1)) * 1.2 * 1e12,
        ),
    ),
    # "orders of magnitude more than <model>" -> range derived from the
    # referenced model's known token count.
    (
        re.compile(r'orders?\s+of\s+magnitude\s+more\s+than\s+([a-z0-9\-]+)'),
        lambda m, ctx: _infer_from_reference(m.group(1), ctx),
    ),
    # "trained on XT tokens" / "trained on X trillion tokens" -> X trillion
    # +/- 10%. The spelled-out unit is accepted alongside the abbreviation.
    (
        re.compile(r'trained\s+on\s+(\d+\.?\d*)\s*(?:trillion|t)\s*tokens?'),
        lambda m, ctx: (
            float(m.group(1)) * 0.9 * 1e12,
            float(m.group(1)) * 1.1 * 1e12,
        ),
    ),
)


def quantize_textual_hint(text: str, context: Optional[Dict] = None) -> Optional[InferenceConstraint]:
    """
    Convert a textual hint about training-set size into a token range.

    The text is lower-cased and tested against an ordered list of phrase
    patterns; the first pattern that matches determines the range.

    Args:
        text: Text containing token hints. Empty or None input yields None.
        context: Optional context (model name, previous estimates, etc.);
            forwarded to the model-reference handler.

    Returns:
        InferenceConstraint with method "textual_quantization", the
        estimated tokens_min/tokens_max, a confidence of 0.4 and a note
        quoting the matched phrase, or None when no pattern matches.
    """
    # Nothing to scan; also avoids calling .lower() on None.
    if not text:
        return None

    text_lower = text.lower()

    for pattern, handler in _HINT_PATTERNS:
        match = pattern.search(text_lower)
        if match is None:
            continue

        token_range = handler(match, context) if callable(handler) else handler
        # A handler that cannot produce a range defers to the later patterns.
        if not isinstance(token_range, tuple):
            continue

        tokens_min, tokens_max = token_range
        return InferenceConstraint(
            method="textual_quantization",
            tokens_min=tokens_min,
            tokens_max=tokens_max,
            evidence_type=EvidenceType.E5.value,
            confidence=0.4,  # Lower confidence for textual hints
            notes=f"Extracted from: '{match.group(0)}'"
        )

    return None
def _infer_from_reference(model_name: str, context: Optional[Dict]) -> Tuple[float, float]:
"""Infer tokens based on reference to another model"""
# Common model token counts (in tokens)
known_tokens = {
"llama-2": 2e12,
"llama-3": 15e12,
"gpt-3": 300e9,
"gpt-4": 13e12,
"claude-3": 3.4e12,
}
model_lower = model_name.lower()
base_tokens = known_tokens.get(model_lower)
if base_tokens:
# "orders of magnitude" = 10-100x
return (base_tokens * 10, base_tokens * 100)
return (1e12, 10e12) # Default wide range
def extract_all_textual_hints(text: str) -> List[InferenceConstraint]:
    """
    Collect token-count hints from every sentence of a document.

    The text is split wherever '.', '!' or '?' is followed by whitespace,
    and each fragment is passed to quantize_textual_hint on its own.

    Args:
        text: Full document text

    Returns:
        List of InferenceConstraints, one per fragment that produced a
        constraint, in document order.
    """
    fragments = re.split(r'[.!?]\s+', text)
    candidates = (quantize_textual_hint(fragment) for fragment in fragments)
    # Keep only the fragments that actually produced a constraint.
    return [found for found in candidates if found]