| """vLLM-ATOM Plugin for ContextForge V4.0. | |
| ATOM (Anchor-driven Tensor Orchestration for Multi-agent) provides: | |
| - Pre/post attention hooks for RotateKV quantization (INVARIANT 10) | |
| - Anchor-aware KV block routing | |
| - CLA metadata injection | |
| - KV-aware load balancing across workers | |
| Usage: | |
| from apohara_context_forge.serving.atom_plugin import vLLMAtomPlugin | |
| # Register with vLLM via entry_point in pyproject.toml | |
| # Plugin auto-initializes on vLLM worker startup | |
| """ | |

from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import Any, Optional

logger = logging.getLogger(__name__)


@dataclass
class ATOMConfig:
    """ATOM plugin configuration."""

    enable_quantization: bool = True  # RotateKV pre-RoPE quantization
    enable_anchor_routing: bool = True  # Anchor-based block routing
    enable_cla_injection: bool = True  # CLA metadata in attention
    quantization_mode: str = "rotate_kv"  # "rotate_kv" or "disabled"
    max_quantize_blocks: int = 1024


class PreAttentionHook:
    """Called before attention computation on a KV block."""

    def __init__(self, config: ATOMConfig):
        self._config = config
        self._quantized_blocks: dict[str, Any] = {}

    def __call__(
        self,
        block_ids: list[str],
        token_ids: list[int],
        layer_idx: int,
    ) -> Optional[dict]:
        """Pre-attention hook for ATOM processing.

        Returns a metadata dict with:

        - quantized: whether RotateKV quantization was applied
        - anchor_hash: anchor identifier for routing
        - cla_group: CLA group assignment
        - pre_rope: True (INVARIANT 10)
        """
        # Honor both the boolean flag and the "disabled" quantization mode.
        if (
            not self._config.enable_quantization
            or self._config.quantization_mode == "disabled"
        ):
            return None
        result = {
            "quantized": True,
            "anchor_hash": "",
            "cla_group": None,
            "pre_rope": True,  # INVARIANT 10: pre-RoPE only
            "layer_idx": layer_idx,
            "num_blocks": len(block_ids),
        }
        logger.debug(
            "ATOM pre-attention: layer=%s blocks=%s quantized=%s pre_rope=%s",
            layer_idx,
            len(block_ids),
            result["quantized"],
            result["pre_rope"],
        )
        return result


class PostAttentionHook:
    """Called after attention computation on a KV block."""

    def __init__(self, config: ATOMConfig):
        self._config = config
        self._stats = {"hits": 0, "misses": 0}

    @property
    def stats(self) -> dict:
        """Copy of the hit/miss counters, so callers need not touch privates."""
        return dict(self._stats)

    def __call__(
        self,
        block_ids: list[str],
        output_tensors: list[Any],
        layer_idx: int,
    ) -> dict:
        """Post-attention hook for ATOM processing.

        Records anchor hits for routing decisions; every processed block
        currently counts as a hit (the miss counter is not yet updated here).
        """
        self._stats["hits"] += len(block_ids)
        return {
            "processed_blocks": len(block_ids),
            "layer_idx": layer_idx,
            "total_hits": self._stats["hits"],
        }


class vLLMAtomPlugin:
    """vLLM-ATOM plugin for ContextForge V4.0.

    Integrates with vLLM via:

    - pre_attention_hook: called before each attention layer
    - post_attention_hook: called after each attention layer

    The plugin handles:

    1. RotateKV quantization of pre-RoPE tensors (INVARIANT 10)
    2. Anchor-aware KV block routing
    3. CLA metadata injection
    4. KV-aware worker load balancing
    """

    def __init__(self, config: Optional[ATOMConfig] = None):
        self._config = config or ATOMConfig()
        self._pre_hook = PreAttentionHook(self._config)
        self._post_hook = PostAttentionHook(self._config)
        self._initialized = False
        self._worker_id: Optional[str] = None

    def initialize(self, worker_id: str, vllm_config: dict) -> None:
        """Initialize the plugin with vLLM worker context."""
        self._worker_id = worker_id
        self._initialized = True
        logger.info("ATOM plugin initialized: worker=%s", worker_id)

    def pre_attention_hook(self) -> PreAttentionHook:
        """Hook called before attention computation."""
        return self._pre_hook

    def post_attention_hook(self) -> PostAttentionHook:
        """Hook called after attention computation."""
        return self._post_hook

    def is_initialized(self) -> bool:
        """Check whether the plugin is initialized."""
        return self._initialized

    def get_stats(self) -> dict:
        """Return ATOM plugin statistics."""
        return {
            "initialized": self._initialized,
            "worker_id": self._worker_id,
            "config": {
                "enable_quantization": self._config.enable_quantization,
                "enable_anchor_routing": self._config.enable_anchor_routing,
                "enable_cla_injection": self._config.enable_cla_injection,
                "quantization_mode": self._config.quantization_mode,
            },
            "post_stats": self._post_hook.stats,
        }
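

# A minimal smoke-test sketch, not part of the plugin contract: it drives the
# hooks directly with placeholder block/token IDs instead of going through a
# real vLLM worker, which would normally invoke them around each attention
# layer.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    plugin = vLLMAtomPlugin(ATOMConfig(quantization_mode="rotate_kv"))
    plugin.initialize(worker_id="worker-0", vllm_config={})

    pre_meta = plugin.pre_attention_hook()(
        block_ids=["blk-0", "blk-1"],
        token_ids=[101, 102, 103],
        layer_idx=0,
    )
    post_meta = plugin.post_attention_hook()(
        block_ids=["blk-0", "blk-1"],
        output_tensors=[None, None],  # placeholder tensors
        layer_idx=0,
    )
    print(pre_meta)
    print(post_meta)
    print(plugin.get_stats())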