"""
compression.py — Pluggable Compression Backends
The conservation law doesn't depend on WHICH compressor is used.
The compressor is the channel. The law is about what survives the channel.
Three backends:
- 'extractive': Deterministic sentence ranking (no model, fast, for testing)
- 'bart': facebook/bart-large-cnn or distilbart (for HuggingFace Space)
- 'api': External LLM via API (GPT-4, Claude, etc.)
All backends implement the same interface:
compress(text: str, target_ratio: float) -> str
"""
import logging
import re
from abc import ABC, abstractmethod
from typing import Optional
class CompressionBackend(ABC):
    """Interface shared by every compression backend.

    A backend maps input text to (usually shorter) output text.
    Concrete implementations must provide `compress` and `name`.
    """

    @abstractmethod
    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Compress *text* to roughly ``target_ratio`` of its original length.

        target_ratio: float in (0, 1); e.g. 0.5 means "about half as long".
        Returns the compressed text.
        """
        ...

    @property
    @abstractmethod
    def name(self) -> str:
        """Short human-readable identifier for this backend."""
        ...
class ExtractiveBackend(CompressionBackend):
    """
    Deterministic extractive compression. No model required.
    Ranks sentences by information density (unique content words / total words)
    and returns the top-k sentences that fit within the target length.
    This is NOT a good compressor. It's a PREDICTABLE compressor.
    That's the point: we can verify the pipeline works before adding
    stochastic models.
    """

    # Commitment-bearing modal operators, matched on word boundaries.
    # FIX: the previous substring test (`'must' in sent.lower()`) also fired
    # on words like "mustard", "marshall", or "requiredness" lookalikes.
    _MODAL_RE = re.compile(
        r'\b(?:must|shall|cannot|required|always|never)\b', re.IGNORECASE)

    @property
    def name(self) -> str:
        return 'extractive'

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Select the highest-density sentences up to ~target_ratio words.

        text: input to compress.
        target_ratio: fraction of the original word count to keep.
        Returns selected sentences re-joined in their original order.
        Single-sentence (or empty) input is returned unchanged.
        """
        sentences = self._split_sentences(text)
        if len(sentences) <= 1:
            return text
        target_len = max(1, int(len(text.split()) * target_ratio))
        # Score each sentence by information density.
        scored = []
        for i, sent in enumerate(sentences):
            words = sent.lower().split()
            if not words:
                continue
            density = len(set(words)) / len(words)
            # Boost sentences containing modal operators (commitment-bearing).
            score = density + (0.5 if self._MODAL_RE.search(sent) else 0.0)
            scored.append((score, i, sent))
        # Sort by score descending (stable: ties keep sentence order), then
        # greedily take sentences until the word budget is filled. The
        # `or not selected` clause guarantees at least one sentence.
        scored.sort(key=lambda x: -x[0])
        selected = []
        word_count = 0
        for _score, idx, sent in scored:
            sent_words = len(sent.split())
            if word_count + sent_words <= target_len or not selected:
                selected.append((idx, sent))
                word_count += sent_words
            if word_count >= target_len:
                break
        # Restore original sentence order before joining.
        selected.sort(key=lambda x: x[0])
        return ' '.join(sent for _, sent in selected)

    def _split_sentences(self, text):
        """Split on sentence-final punctuation (., !, ?) and semicolons."""
        parts = re.split(r'(?<=[.!?;])\s+', text)
        return [p.strip() for p in parts if p.strip()]
class BartBackend(CompressionBackend):
    """
    BART-based abstractive compression.

    Lazy-loads the HuggingFace summarization pipeline on first use so that
    importing this module never requires `transformers` to be installed.
    """

    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6",
                 device: int = -1):
        """
        model_name: any HuggingFace summarization checkpoint.
        device: HF pipeline device id (-1 = CPU, >= 0 = CUDA device index).
            New backward-compatible parameter; the default preserves the
            previous hard-coded CPU behavior.
        """
        self._model_name = model_name
        self._device = device
        self._summarizer = None  # populated lazily by _load()

    @property
    def name(self) -> str:
        return f'bart:{self._model_name}'

    def _load(self):
        """Build the summarization pipeline once, on first use."""
        if self._summarizer is None:
            from transformers import pipeline
            self._summarizer = pipeline(
                "summarization",
                model=self._model_name,
                device=self._device,
            )

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Summarize *text* to ~target_ratio of its word count.

        Falls back to returning the input unchanged if summarization fails
        (e.g. input too short for the model).
        """
        self._load()
        # Estimate target max_length in tokens (~1.3 tokens per word).
        word_count = len(text.split())
        max_length = max(10, int(word_count * target_ratio * 1.3))
        min_length = max(5, max_length // 4)
        try:
            result = self._summarizer(
                text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False
            )
            return result[0]['summary_text']
        except Exception:
            # Best-effort fallback (typically: text too short to summarize).
            # FIX: previously the exception was bound but discarded silently,
            # hiding genuine failures; log it so they remain visible.
            logging.getLogger(__name__).warning(
                "BART summarization failed; returning input unchanged",
                exc_info=True)
            return text
class BackTranslationBackend(CompressionBackend):
    """
    Paraphrase via round-trip machine translation (en -> de -> en).

    Strictly a TRANSFORMATION rather than compression: output length is
    roughly preserved. It serves as the second stress in the dual-stress
    regime.
    """

    def __init__(self):
        # Both translation pipelines are lazy-loaded together in _load().
        self._en_de = None
        self._de_en = None

    @property
    def name(self) -> str:
        return 'back_translation'

    def _load(self):
        """Build both Helsinki-NLP translation pipelines once, on first use."""
        if self._en_de is not None:
            return
        from transformers import pipeline
        self._en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de", device=-1)
        self._de_en = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en", device=-1)

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Back-translate. target_ratio is ignored (paraphrase preserves length)."""
        self._load()
        forward = self._en_de(text, max_length=512, do_sample=False)
        german = forward[0]['translation_text']
        round_trip = self._de_en(german, max_length=512, do_sample=False)
        return round_trip[0]['translation_text']
# ---------------------------------------------------------------------------
# Factory
# ---------------------------------------------------------------------------
_BACKENDS = {
    'extractive': ExtractiveBackend,
    'bart': BartBackend,
    'back_translation': BackTranslationBackend,
}


def get_backend(name: str = 'extractive', **kwargs) -> CompressionBackend:
    """Instantiate a compression backend by name.

    kwargs are forwarded to the chosen backend's constructor.
    Raises ValueError for unknown backend names.
    """
    # The lossy backends live in a sibling module; import them lazily to
    # avoid circular imports.
    if name in ('lossy', 'lossy_enforced'):
        from .lossy import LossyBackend, LossyEnforcedBackend
        lossy_cls = LossyBackend if name == 'lossy' else LossyEnforcedBackend
        return lossy_cls(**kwargs)
    if name not in _BACKENDS:
        raise ValueError(f"Unknown backend '{name}'. Available: {list(_BACKENDS.keys()) + ['lossy', 'lossy_enforced']}")
    return _BACKENDS[name](**kwargs)
|