"""
compression.py — Pluggable Compression Backends

The conservation law doesn't depend on WHICH compressor is used.
The compressor is the channel. The law is about what survives the channel.

Backends registered in this module:
  - 'extractive': Deterministic sentence ranking (no model, fast, for testing)
  - 'bart': facebook/bart-large-cnn or distilbart (for HuggingFace Space)
  - 'back_translation': en→de→en paraphrase (a transformation, not compression)

``get_backend`` additionally resolves 'lossy' and 'lossy_enforced' by lazily
importing them from the sibling ``.lossy`` module.

NOTE(review): earlier documentation also listed an 'api' backend (external LLM
via API: GPT-4, Claude, etc.). No such backend is registered in this module's
factory — confirm whether it lives elsewhere.

All backends implement the same interface:
    compress(text: str, target_ratio: float) -> str
"""

import logging
import re
from typing import List, Optional
from abc import ABC, abstractmethod

_logger = logging.getLogger(__name__)


class CompressionBackend(ABC):
    """Abstract compression backend."""

    @abstractmethod
    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """
        Compress text to approximately target_ratio of original length.

        target_ratio: float in (0, 1), e.g. 0.5 = compress to half length.
        Returns compressed text.
        """
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier for this backend."""
        pass


class ExtractiveBackend(CompressionBackend):
    """
    Deterministic extractive compression. No model required.

    Ranks sentences by information density (unique content words / total words)
    and returns the top-k sentences that fit within the target length.

    This is NOT a good compressor. It's a PREDICTABLE compressor.
    That's the point: we can verify the pipeline works before adding
    stochastic models.
    """

    # Sentence boundary: whitespace that follows '.', '!', '?' or ';'.
    _SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?;])\s+')

    # Commitment-bearing modal words, matched as WHOLE words so that
    # e.g. "mustard", "shallow" or "nevertheless" do not earn the boost.
    _MODAL_RE = re.compile(
        r'\b(?:must|shall|cannot|required|always|never)\b',
        re.IGNORECASE,
    )

    @property
    def name(self) -> str:
        """Backend identifier: always ``'extractive'``."""
        return 'extractive'

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """
        Select the highest-scoring sentences up to a word budget.

        The budget is ``max(1, int(word_count * target_ratio))`` words. Each
        sentence scores ``unique_words / total_words`` (case-insensitive),
        plus 0.5 if it contains a modal word. Sentences are taken greedily in
        descending score order; at least one sentence is always kept, even if
        it alone exceeds the budget. Selected sentences are emitted in their
        original order, joined by single spaces.

        Text with zero or one sentence is returned unchanged.
        """
        sentences = self._split_sentences(text)
        if len(sentences) <= 1:
            return text

        target_len = max(1, int(len(text.split()) * target_ratio))

        # Score each sentence by information density.
        scored = []
        for position, sentence in enumerate(sentences):
            words = sentence.lower().split()
            if not words:
                continue
            density = len(set(words)) / len(words)
            # Boost sentences with modal operators (commitment-bearing).
            boost = 0.5 if self._MODAL_RE.search(sentence) else 0.0
            scored.append((density + boost, position, sentence))

        # Stable sort: ties keep original sentence order.
        scored.sort(key=lambda item: -item[0])

        selected = []
        word_count = 0
        for _score, position, sentence in scored:
            sentence_words = len(sentence.split())
            # `not selected` guarantees at least one sentence is returned.
            if word_count + sentence_words <= target_len or not selected:
                selected.append((position, sentence))
                word_count += sentence_words
            if word_count >= target_len:
                break

        # Restore original order.
        selected.sort(key=lambda item: item[0])
        return ' '.join(sentence for _, sentence in selected)

    def _split_sentences(self, text: str) -> List[str]:
        """Split on sentence boundaries and semicolons; drop empty pieces."""
        parts = self._SENTENCE_BOUNDARY_RE.split(text)
        return [p.strip() for p in parts if p.strip()]


class BartBackend(CompressionBackend):
    """
    BART-based abstractive compression.
    Lazy-loads model on first use.
    """

    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6"):
        self._model_name = model_name
        self._summarizer = None

    @property
    def name(self) -> str:
        """Backend identifier of the form ``'bart:<model_name>'``."""
        return f'bart:{self._model_name}'

    def _load(self):
        """Create the summarization pipeline once; later calls are no-ops."""
        if self._summarizer is None:
            from transformers import pipeline
            self._summarizer = pipeline(
                "summarization",
                model=self._model_name,
                device=-1  # CPU
            )

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """
        Summarize ``text`` with the configured model.

        ``max_length`` is estimated as ``word_count * target_ratio * 1.3``
        tokens (floor of 10); ``min_length`` is a quarter of that (floor of 5).
        Empty or whitespace-only input is returned unchanged without loading
        the model. If summarization raises, the failure is logged and the
        input text is returned as-is (best-effort behaviour).
        """
        # Nothing to summarize: skip the (expensive) model load entirely.
        if not text.strip():
            return text

        self._load()

        # Estimate target max_length in tokens (~1.3 tokens per word)
        word_count = len(text.split())
        max_length = max(10, int(word_count * target_ratio * 1.3))
        min_length = max(5, max_length // 4)

        try:
            result = self._summarizer(
                text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False
            )
            return result[0]['summary_text']
        except Exception as exc:
            # Deliberate best-effort fallback (e.g. text too short for
            # summarization): return the input as-is, but record the failure.
            _logger.warning(
                "Summarization with %s failed (%s: %s); returning input unchanged",
                self._model_name, type(exc).__name__, exc,
            )
            return text


class BackTranslationBackend(CompressionBackend):
    """
    Paraphrase via back-translation (en→de→en).
    This is a TRANSFORMATION, not compression per se,
    but it's the second stress in the dual-stress regime.
    """

    def __init__(self):
        self._en_de = None
        self._de_en = None

    @property
    def name(self) -> str:
        """Backend identifier: always ``'back_translation'``."""
        return 'back_translation'

    def _load(self):
        """Create both translation pipelines once; later calls are no-ops."""
        if self._en_de is None:
            from transformers import pipeline
            self._en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de", device=-1)
            self._de_en = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en", device=-1)

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """
        Back-translate. target_ratio is ignored (paraphrase preserves length).

        Empty or whitespace-only input is returned unchanged without loading
        the translation models.
        """
        if not text.strip():
            return text

        self._load()
        de = self._en_de(text, max_length=512, do_sample=False)[0]['translation_text']
        en = self._de_en(de, max_length=512, do_sample=False)[0]['translation_text']
        return en


# ---------------------------------------------------------------------------
# Factory
# ---------------------------------------------------------------------------

_BACKENDS = {
    'extractive': ExtractiveBackend,
    'bart': BartBackend,
    'back_translation': BackTranslationBackend,
}


def get_backend(name: str = 'extractive', **kwargs) -> CompressionBackend:
    """
    Get a compression backend by name.

    ``kwargs`` are forwarded to the backend's constructor.
    Raises ValueError if ``name`` is not a known backend.
    """
    # Lazy import lossy backends to avoid circular imports
    if name in ('lossy', 'lossy_enforced'):
        from .lossy import LossyBackend, LossyEnforcedBackend
        if name == 'lossy':
            return LossyBackend(**kwargs)
        return LossyEnforcedBackend(**kwargs)

    if name not in _BACKENDS:
        raise ValueError(f"Unknown backend '{name}'. Available: {list(_BACKENDS.keys()) + ['lossy', 'lossy_enforced']}")
    return _BACKENDS[name](**kwargs)