| """ |
| compression.py — Pluggable Compression Backends |
| |
| The conservation law doesn't depend on WHICH compressor is used. |
| The compressor is the channel. The law is about what survives the channel. |
| |
| Three backends: |
| - 'extractive': Deterministic sentence ranking (no model, fast, for testing) |
| - 'bart': facebook/bart-large-cnn or distilbart (for HuggingFace Space) |
| - 'api': External LLM via API (GPT-4, Claude, etc.) |
| |
| All backends implement the same interface: |
| compress(text: str, target_ratio: float) -> str |
| """ |
|
|
import logging
import re
from abc import ABC, abstractmethod
from typing import Optional
|
|
|
|
class CompressionBackend(ABC):
    """Interface shared by every compression backend.

    A backend is treated as an opaque channel: it takes text and a
    target length fraction, and returns the text that survived.
    """

    @abstractmethod
    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Compress *text* to approximately ``target_ratio`` of its length.

        target_ratio: float in (0, 1); e.g. 0.5 compresses to half length.
        Returns the compressed text.
        """
        ...

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier for this backend."""
        ...
|
|
|
|
| class ExtractiveBackend(CompressionBackend): |
| """ |
| Deterministic extractive compression. No model required. |
| |
| Ranks sentences by information density (unique content words / total words) |
| and returns the top-k sentences that fit within the target length. |
| |
| This is NOT a good compressor. It's a PREDICTABLE compressor. |
| That's the point: we can verify the pipeline works before adding |
| stochastic models. |
| """ |
| |
| @property |
| def name(self) -> str: |
| return 'extractive' |
| |
| def compress(self, text: str, target_ratio: float = 0.5) -> str: |
| sentences = self._split_sentences(text) |
| if len(sentences) <= 1: |
| return text |
| |
| target_len = max(1, int(len(text.split()) * target_ratio)) |
| |
| |
| scored = [] |
| for i, sent in enumerate(sentences): |
| words = sent.lower().split() |
| if not words: |
| continue |
| unique = len(set(words)) |
| density = unique / len(words) |
| |
| has_modal = any(m in sent.lower() for m in |
| ['must', 'shall', 'cannot', 'required', 'always', 'never']) |
| score = density + (0.5 if has_modal else 0.0) |
| scored.append((score, i, sent)) |
| |
| |
| scored.sort(key=lambda x: -x[0]) |
| |
| selected = [] |
| word_count = 0 |
| for score, idx, sent in scored: |
| sent_words = len(sent.split()) |
| if word_count + sent_words <= target_len or not selected: |
| selected.append((idx, sent)) |
| word_count += sent_words |
| if word_count >= target_len: |
| break |
| |
| |
| selected.sort(key=lambda x: x[0]) |
| return ' '.join(sent for _, sent in selected) |
| |
| def _split_sentences(self, text: str): |
| """Split on sentence boundaries and semicolons.""" |
| parts = re.split(r'(?<=[.!?;])\s+', text) |
| return [p.strip() for p in parts if p.strip()] |
|
|
|
|
| class BartBackend(CompressionBackend): |
| """ |
| BART-based abstractive compression. |
| Lazy-loads model on first use. |
| """ |
| |
| def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6"): |
| self._model_name = model_name |
| self._summarizer = None |
| |
| @property |
| def name(self) -> str: |
| return f'bart:{self._model_name}' |
| |
| def _load(self): |
| if self._summarizer is None: |
| from transformers import pipeline |
| self._summarizer = pipeline( |
| "summarization", |
| model=self._model_name, |
| device=-1 |
| ) |
| |
| def compress(self, text: str, target_ratio: float = 0.5) -> str: |
| self._load() |
| |
| |
| word_count = len(text.split()) |
| max_length = max(10, int(word_count * target_ratio * 1.3)) |
| min_length = max(5, max_length // 4) |
| |
| try: |
| result = self._summarizer( |
| text, |
| max_length=max_length, |
| min_length=min_length, |
| do_sample=False |
| ) |
| return result[0]['summary_text'] |
| except Exception as e: |
| |
| return text |
|
|
|
|
| class BackTranslationBackend(CompressionBackend): |
| """ |
| Paraphrase via back-translation (en→de→en). |
| This is a TRANSFORMATION, not compression per se, |
| but it's the second stress in the dual-stress regime. |
| """ |
| |
| def __init__(self): |
| self._en_de = None |
| self._de_en = None |
| |
| @property |
| def name(self) -> str: |
| return 'back_translation' |
| |
| def _load(self): |
| if self._en_de is None: |
| from transformers import pipeline |
| self._en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de", device=-1) |
| self._de_en = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en", device=-1) |
| |
| def compress(self, text: str, target_ratio: float = 0.5) -> str: |
| """Back-translate. target_ratio is ignored (paraphrase preserves length).""" |
| self._load() |
| de = self._en_de(text, max_length=512, do_sample=False)[0]['translation_text'] |
| en = self._de_en(de, max_length=512, do_sample=False)[0]['translation_text'] |
| return en |
|
|
|
|
| |
| |
| |
|
|
# Registry of eagerly-importable backends, keyed by the public name
# accepted by get_backend(). The 'lossy'/'lossy_enforced' backends are
# intentionally absent: they are imported lazily inside get_backend().
_BACKENDS = {
    'extractive': ExtractiveBackend,
    'bart': BartBackend,
    'back_translation': BackTranslationBackend,
}
|
|
def get_backend(name: str = 'extractive', **kwargs) -> CompressionBackend:
    """Get a compression backend by name."""
    # Lossy backends live in a sibling module; import them lazily so the
    # common backends still work without it.
    if name in ('lossy', 'lossy_enforced'):
        from .lossy import LossyBackend, LossyEnforcedBackend
        lossy_cls = LossyBackend if name == 'lossy' else LossyEnforcedBackend
        return lossy_cls(**kwargs)

    if name not in _BACKENDS:
        available = list(_BACKENDS.keys()) + ['lossy', 'lossy_enforced']
        raise ValueError(f"Unknown backend '{name}'. Available: {available}")
    return _BACKENDS[name](**kwargs)
|
|