# Deploy harness v2 to root for HuggingFace Space (commit 19d2058)
"""
compression.py — Pluggable Compression Backends
The conservation law doesn't depend on WHICH compressor is used.
The compressor is the channel. The law is about what survives the channel.
Three backends:
- 'extractive': Deterministic sentence ranking (no model, fast, for testing)
- 'bart': facebook/bart-large-cnn or distilbart (for HuggingFace Space)
- 'api': External LLM via API (GPT-4, Claude, etc.)
All backends implement the same interface:
compress(text: str, target_ratio: float) -> str
"""
import re
from typing import Optional
from abc import ABC, abstractmethod
class CompressionBackend(ABC):
    """Interface shared by every compression backend."""

    @abstractmethod
    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Reduce *text* to roughly ``target_ratio`` of its original length.

        target_ratio: float in (0, 1); e.g. 0.5 means "compress to half
        the length".  Returns the compressed text.
        """
        ...

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier for this backend (used in logs/factory)."""
        ...
class ExtractiveBackend(CompressionBackend):
    """
    Deterministic extractive compression. No model required.

    Every sentence is scored by lexical density (unique lowercased tokens
    over total tokens) plus a flat bonus for commitment-bearing modal
    words; the best-scoring sentences that fit the word budget are kept
    and re-emitted in document order.

    Deliberately crude: the value is PREDICTABILITY, so the pipeline can
    be validated before any stochastic model is introduced.
    """

    @property
    def name(self) -> str:
        return 'extractive'

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Keep the densest sentences, up to ~target_ratio of the word count."""
        pieces = self._split_sentences(text)
        if len(pieces) <= 1:
            # Nothing to rank — hand the input back untouched.
            return text
        budget = max(1, int(len(text.split()) * target_ratio))

        # Score every non-empty sentence: density + modal bonus.
        ranked = []
        for position, sentence in enumerate(pieces):
            tokens = sentence.lower().split()
            if not tokens:
                continue
            density = len(set(tokens)) / len(tokens)
            lowered = sentence.lower()
            # Sentences carrying modal operators are commitment-bearing,
            # so they get a flat boost.
            bonus = 0.5 if any(
                marker in lowered
                for marker in ('must', 'shall', 'cannot', 'required', 'always', 'never')
            ) else 0.0
            ranked.append((density + bonus, position, sentence))

        # Greedily take the best sentences until the budget is spent.
        # (sort is stable, so ties keep document order.)
        ranked.sort(key=lambda entry: entry[0], reverse=True)
        kept = []
        used = 0
        for _, position, sentence in ranked:
            length = len(sentence.split())
            # Always keep at least one sentence, even if it overshoots.
            if used + length <= budget or not kept:
                kept.append((position, sentence))
                used += length
            if used >= budget:
                break

        # Emit in document order, not score order. Positions are unique,
        # so sorting the (position, sentence) tuples sorts by position.
        kept.sort()
        return ' '.join(sentence for _, sentence in kept)

    def _split_sentences(self, text: str):
        """Split on sentence-terminal punctuation (., !, ?, ;) plus whitespace."""
        return [chunk.strip() for chunk in re.split(r'(?<=[.!?;])\s+', text) if chunk.strip()]
class BartBackend(CompressionBackend):
    """
    Abstractive compression via a BART summarization model.

    The transformers pipeline is heavy, so it is lazy-loaded on the first
    compress() call rather than at construction time.
    """

    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6"):
        # Defer model download/initialization until actually needed.
        self._model_name = model_name
        self._summarizer = None

    @property
    def name(self) -> str:
        return f'bart:{self._model_name}'

    def _load(self):
        """Instantiate the summarization pipeline exactly once (CPU only)."""
        if self._summarizer is not None:
            return
        from transformers import pipeline
        self._summarizer = pipeline(
            "summarization",
            model=self._model_name,
            device=-1  # CPU
        )

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Summarize *text* to roughly target_ratio of its word count."""
        self._load()
        # Rough token budget: ~1.3 tokens per whitespace-delimited word.
        words = len(text.split())
        upper = max(10, int(words * target_ratio * 1.3))
        lower = max(5, upper // 4)
        try:
            summary = self._summarizer(
                text,
                max_length=upper,
                min_length=lower,
                do_sample=False
            )
            return summary[0]['summary_text']
        except Exception:
            # Deliberate best-effort fallback: inputs too short for the
            # model (or otherwise unsummarizable) come back unchanged.
            return text
class BackTranslationBackend(CompressionBackend):
    """
    Paraphrase via round-trip translation (en -> de -> en).

    Not compression in the strict sense — length is roughly preserved —
    but it serves as the second stress in the dual-stress regime.
    """

    def __init__(self):
        # Both MarianMT pipelines are lazy-loaded together on first use.
        self._en_de = None
        self._de_en = None

    @property
    def name(self) -> str:
        return 'back_translation'

    def _load(self):
        """Load the en->de and de->en translation pipelines (CPU only)."""
        if self._en_de is not None:
            return
        from transformers import pipeline
        self._en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de", device=-1)
        self._de_en = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en", device=-1)

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Back-translate. target_ratio is ignored (paraphrase preserves length)."""
        self._load()
        german = self._en_de(text, max_length=512, do_sample=False)[0]['translation_text']
        return self._de_en(german, max_length=512, do_sample=False)[0]['translation_text']
# ---------------------------------------------------------------------------
# Factory
# ---------------------------------------------------------------------------
# Registry of eagerly-importable backends. The 'lossy' / 'lossy_enforced'
# variants are intentionally absent: they are imported lazily inside
# get_backend() to avoid a circular import.
_BACKENDS = {
    'extractive': ExtractiveBackend,
    'bart': BartBackend,
    'back_translation': BackTranslationBackend,
}
def get_backend(name: str = 'extractive', **kwargs) -> CompressionBackend:
    """Look up and construct a compression backend by name.

    kwargs are forwarded to the backend's constructor.
    Raises ValueError when the name is not recognized.
    """
    # The lossy backends live in a sibling module; importing them lazily
    # here (rather than at module top) avoids a circular import.
    if name == 'lossy':
        from .lossy import LossyBackend
        return LossyBackend(**kwargs)
    if name == 'lossy_enforced':
        from .lossy import LossyEnforcedBackend
        return LossyEnforcedBackend(**kwargs)
    if name not in _BACKENDS:
        raise ValueError(f"Unknown backend '{name}'. Available: {list(_BACKENDS.keys()) + ['lossy', 'lossy_enforced']}")
    return _BACKENDS[name](**kwargs)