Spaces:

midah
/

odl-training-data

Running

App Files Files Community

odl-training-data / registry /inference /textual_quantization.py

midah

AI Training Data Deals Dashboard with automated discovery, extraction pipeline, and MCP integration

0efb0d1 6 months ago

raw

history blame contribute delete

3.43 kB

	"""
	Textual Token Clue Quantization
	Converts linguistic hints into numeric token ranges
	"""

	import re
	from typing import Dict, List, Optional, Tuple

	from registry.schema import InferenceConstraint, EvidenceType


	def quantize_textual_hint(text: str, context: Optional[Dict] = None) -> Optional[InferenceConstraint]:
	    """
	    Convert a textual hint about training-set size into a token range.

	    The text is lower-cased and tested against a fixed, ordered list of
	    phrase patterns; the first pattern that produces a range wins.

	    Args:
	        text: Text that may contain a token hint,
	            e.g. "over 2 trillion tokens" or "trained on 15T tokens".
	        context: Optional context, forwarded unchanged to
	            _infer_from_reference for "orders of magnitude more than
	            <model>" phrases.

	    Returns:
	        InferenceConstraint carrying the (tokens_min, tokens_max) range,
	        or None when text is empty or no pattern matches.
	    """
	    if not text:
	        return None

	    text_lower = text.lower()

	    # Decimal quantity such as "2", "15" or "1.5". The trailing digits are
	    # optional so single-digit values ("over 2 trillion") match as well.
	    number = r'(\d+\.?\d*)'

	    # Each entry is (regex, handler). A handler is either a fixed
	    # (min, max) tuple or a callable that maps the match to such a tuple.
	    patterns = [
	        # "several trillion tokens"
	        (r'several\s+trillion\s+tokens?', (1e12, 10e12)),
	        # "trillions of tokens"
	        (r'trillions?\s+of\s+tokens?', (2e12, 20e12)),
	        # "over X trillion tokens" -> [X, 2X] trillion
	        (
	            r'over\s+' + number + r'\s*trillion\s+tokens?',
	            lambda m: (float(m.group(1)) * 1e12, float(m.group(1)) * 2 * 1e12),
	        ),
	        # "approximately X trillion" -> X +/- 20%
	        (
	            r'approximately\s+' + number + r'\s*trillion',
	            lambda m: (float(m.group(1)) * 0.8 * 1e12, float(m.group(1)) * 1.2 * 1e12),
	        ),
	        # "orders of magnitude more than X"
	        (
	            r'orders?\s+of\s+magnitude\s+more\s+than\s+([a-z0-9\-]+)',
	            lambda m: _infer_from_reference(m.group(1), context),
	        ),
	        # "trained on XT tokens" -> X +/- 10% (text is lower-cased, so "t")
	        (
	            r'trained\s+on\s+' + number + r'\s*t\s*tokens?',
	            lambda m: (float(m.group(1)) * 0.9 * 1e12, float(m.group(1)) * 1.1 * 1e12),
	        ),
	    ]

	    for pattern, handler in patterns:
	        match = re.search(pattern, text_lower)
	        if not match:
	            continue

	        if callable(handler):
	            result = handler(match)
	            # A handler that cannot produce a range defers to later patterns.
	            if not isinstance(result, tuple):
	                continue
	            tokens_min, tokens_max = result
	        else:
	            tokens_min, tokens_max = handler

	        return InferenceConstraint(
	            method="textual_quantization",
	            tokens_min=tokens_min,
	            tokens_max=tokens_max,
	            evidence_type=EvidenceType.E5.value,
	            confidence=0.4,  # Lower confidence for textual hints
	            notes=f"Extracted from: '{match.group(0)}'"
	        )

	    return None


	def _infer_from_reference(model_name: str, context: Optional[Dict]) -> Tuple[float, float]:
	    """
	    Infer a token range from an "orders of magnitude more than <model>" hint.

	    Args:
	        model_name: Name of the referenced model; matched case-insensitively.
	        context: Accepted for interface compatibility; not read here.

	    Returns:
	        (min, max) token estimate: 10x-100x the reference model's token
	        count when the model (or its family) is known, otherwise the
	        default wide range (1e12, 10e12).
	    """
	    # Common model token counts (in tokens)
	    known_tokens = {
	        "llama-2": 2e12,
	        "llama-3": 15e12,
	        "gpt-3": 300e9,
	        "gpt-4": 13e12,
	        "claude-3": 3.4e12,
	    }

	    model_lower = model_name.strip().lower()
	    base_tokens = known_tokens.get(model_lower)

	    if base_tokens is None:
	        # Variant names such as "llama-2-70b" should resolve to their family.
	        # Requiring a hyphen after the family name keeps "llama-30b" from
	        # being mistaken for "llama-3".
	        families = [name for name in known_tokens if model_lower.startswith(name + "-")]
	        if families:
	            base_tokens = known_tokens[max(families, key=len)]

	    if base_tokens is None:
	        return (1e12, 10e12)  # Default wide range

	    # "orders of magnitude" = 10-100x
	    return (base_tokens * 10, base_tokens * 100)


	def extract_all_textual_hints(text: str) -> List[InferenceConstraint]:
	    """
	    Collect every token hint found in a document, one per sentence at most.

	    Args:
	        text: Full document text

	    Returns:
	        List of InferenceConstraints, in sentence order; sentences that
	        yield no constraint are skipped.
	    """
	    # Sentence boundary: terminal punctuation followed by whitespace.
	    pieces = re.split(r'[.!?]\s+', text)

	    # Evaluate lazily, sentence by sentence, then keep only the hits.
	    candidates = (quantize_textual_hint(piece) for piece in pieces)
	    return [found for found in candidates if found]