# Deploy harness v2 to root for HuggingFace Space (commit 19d2058)
"""
compression.py — Pluggable Compression Backends
The conservation law doesn't depend on WHICH compressor is used.
The compressor is the channel. The law is about what survives the channel.
Three backends:
- 'extractive': Deterministic sentence ranking (no model, fast, for testing)
- 'bart': facebook/bart-large-cnn or distilbart (for HuggingFace Space)
- 'api': External LLM via API (GPT-4, Claude, etc.)
All backends implement the same interface:
compress(text: str, target_ratio: float) -> str
"""
import re
from typing import Optional
from abc import ABC, abstractmethod
class CompressionBackend(ABC):
    """Interface shared by every compression backend."""

    @abstractmethod
    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Reduce *text* to roughly ``target_ratio`` of its original length.

        target_ratio: float in (0, 1); e.g. 0.5 means "compress to half
        the length".  Returns the compressed text.
        """
        ...

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier for this backend (used in logs/factory)."""
        ...
class ExtractiveBackend(CompressionBackend):
    """
    Deterministic extractive compression. No model required.

    Every sentence is scored by lexical density (unique lowercased tokens
    over total tokens) plus a flat bonus for commitment-bearing modal
    words; the best-scoring sentences that fit the word budget are kept
    and re-emitted in document order.

    Deliberately crude: the value is PREDICTABILITY, so the pipeline can
    be validated before any stochastic model is introduced.
    """

    @property
    def name(self) -> str:
        return 'extractive'

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Keep the densest sentences, up to ~target_ratio of the word count."""
        pieces = self._split_sentences(text)
        if len(pieces) <= 1:
            # Nothing to rank — hand the input back untouched.
            return text
        budget = max(1, int(len(text.split()) * target_ratio))

        # Score every non-empty sentence: density + modal bonus.
        ranked = []
        for position, sentence in enumerate(pieces):
            tokens = sentence.lower().split()
            if not tokens:
                continue
            density = len(set(tokens)) / len(tokens)
            lowered = sentence.lower()
            # Sentences carrying modal operators are commitment-bearing,
            # so they get a flat boost.
            bonus = 0.5 if any(
                marker in lowered
                for marker in ('must', 'shall', 'cannot', 'required', 'always', 'never')
            ) else 0.0
            ranked.append((density + bonus, position, sentence))

        # Greedily take the best sentences until the budget is spent.
        # (sort is stable, so ties keep document order.)
        ranked.sort(key=lambda entry: entry[0], reverse=True)
        kept = []
        used = 0
        for _, position, sentence in ranked:
            length = len(sentence.split())
            # Always keep at least one sentence, even if it overshoots.
            if used + length <= budget or not kept:
                kept.append((position, sentence))
                used += length
            if used >= budget:
                break

        # Emit in document order, not score order. Positions are unique,
        # so sorting the (position, sentence) tuples sorts by position.
        kept.sort()
        return ' '.join(sentence for _, sentence in kept)

    def _split_sentences(self, text: str):
        """Split on sentence-terminal punctuation (., !, ?, ;) plus whitespace."""
        return [chunk.strip() for chunk in re.split(r'(?<=[.!?;])\s+', text) if chunk.strip()]
class BartBackend(CompressionBackend):
    """
    Abstractive compression via a BART summarization model.

    The transformers pipeline is heavy, so it is lazy-loaded on the first
    compress() call rather than at construction time.
    """

    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6"):
        # Defer model download/initialization until actually needed.
        self._model_name = model_name
        self._summarizer = None

    @property
    def name(self) -> str:
        return f'bart:{self._model_name}'

    def _load(self):
        """Instantiate the summarization pipeline exactly once (CPU only)."""
        if self._summarizer is not None:
            return
        from transformers import pipeline
        self._summarizer = pipeline(
            "summarization",
            model=self._model_name,
            device=-1  # CPU
        )

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Summarize *text* to roughly target_ratio of its word count."""
        self._load()
        # Rough token budget: ~1.3 tokens per whitespace-delimited word.
        words = len(text.split())
        upper = max(10, int(words * target_ratio * 1.3))
        lower = max(5, upper // 4)
        try:
            summary = self._summarizer(
                text,
                max_length=upper,
                min_length=lower,
                do_sample=False
            )
            return summary[0]['summary_text']
        except Exception:
            # Deliberate best-effort fallback: inputs too short for the
            # model (or otherwise unsummarizable) come back unchanged.
            return text
class BackTranslationBackend(CompressionBackend):
    """
    Paraphrase via round-trip translation (en -> de -> en).

    Not compression in the strict sense — length is roughly preserved —
    but it serves as the second stress in the dual-stress regime.
    """

    def __init__(self):
        # Both MarianMT pipelines are lazy-loaded together on first use.
        self._en_de = None
        self._de_en = None

    @property
    def name(self) -> str:
        return 'back_translation'

    def _load(self):
        """Load the en->de and de->en translation pipelines (CPU only)."""
        if self._en_de is not None:
            return
        from transformers import pipeline
        self._en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de", device=-1)
        self._de_en = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en", device=-1)

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Back-translate. target_ratio is ignored (paraphrase preserves length)."""
        self._load()
        german = self._en_de(text, max_length=512, do_sample=False)[0]['translation_text']
        return self._de_en(german, max_length=512, do_sample=False)[0]['translation_text']
# ---------------------------------------------------------------------------
# Factory
# ---------------------------------------------------------------------------
# Registry of eagerly-importable backends. The 'lossy' / 'lossy_enforced'
# variants are intentionally absent: they are imported lazily inside
# get_backend() to avoid a circular import.
_BACKENDS = {
    'extractive': ExtractiveBackend,
    'bart': BartBackend,
    'back_translation': BackTranslationBackend,
}
def get_backend(name: str = 'extractive', **kwargs) -> CompressionBackend:
    """Look up and construct a compression backend by name.

    kwargs are forwarded to the backend's constructor.
    Raises ValueError when the name is not recognized.
    """
    # The lossy backends live in a sibling module; importing them lazily
    # here (rather than at module top) avoids a circular import.
    if name == 'lossy':
        from .lossy import LossyBackend
        return LossyBackend(**kwargs)
    if name == 'lossy_enforced':
        from .lossy import LossyEnforcedBackend
        return LossyEnforcedBackend(**kwargs)
    if name not in _BACKENDS:
        raise ValueError(f"Unknown backend '{name}'. Available: {list(_BACKENDS.keys()) + ['lossy', 'lossy_enforced']}")
    return _BACKENDS[name](**kwargs)