"""
compression.py — Pluggable Compression Backends
The conservation law doesn't depend on WHICH compressor is used.
The compressor is the channel. The law is about what survives the channel.
Three backends:
- 'extractive': Deterministic sentence ranking (no model, fast, for testing)
- 'bart': facebook/bart-large-cnn or distilbart (for HuggingFace Space)
- 'api': External LLM via API (GPT-4, Claude, etc.)
All backends implement the same interface:
compress(text: str, target_ratio: float) -> str
"""
import logging
import re
from abc import ABC, abstractmethod
from typing import Optional
class CompressionBackend(ABC):
    """Interface shared by every compression backend.

    A backend maps input text to (usually shorter) output text.
    Concrete implementations must provide `compress` and `name`.
    """

    @abstractmethod
    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Compress *text* to roughly ``target_ratio`` of its original length.

        target_ratio: float in (0, 1); e.g. 0.5 means "about half as long".
        Returns the compressed text.
        """
        ...

    @property
    @abstractmethod
    def name(self) -> str:
        """Short human-readable identifier for this backend."""
        ...
class ExtractiveBackend(CompressionBackend):
    """
    Deterministic extractive compression. No model required.
    Ranks sentences by information density (unique content words / total words)
    and returns the top-k sentences that fit within the target length.
    This is NOT a good compressor. It's a PREDICTABLE compressor.
    That's the point: we can verify the pipeline works before adding
    stochastic models.
    """

    # Commitment-bearing modal operators, matched on word boundaries.
    # FIX: the previous substring test (`'must' in sent.lower()`) also fired
    # on words like "mustard", "marshall", or "requiredness" lookalikes.
    _MODAL_RE = re.compile(
        r'\b(?:must|shall|cannot|required|always|never)\b', re.IGNORECASE)

    @property
    def name(self) -> str:
        return 'extractive'

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Select the highest-density sentences up to ~target_ratio words.

        text: input to compress.
        target_ratio: fraction of the original word count to keep.
        Returns selected sentences re-joined in their original order.
        Single-sentence (or empty) input is returned unchanged.
        """
        sentences = self._split_sentences(text)
        if len(sentences) <= 1:
            return text
        target_len = max(1, int(len(text.split()) * target_ratio))
        # Score each sentence by information density.
        scored = []
        for i, sent in enumerate(sentences):
            words = sent.lower().split()
            if not words:
                continue
            density = len(set(words)) / len(words)
            # Boost sentences containing modal operators (commitment-bearing).
            score = density + (0.5 if self._MODAL_RE.search(sent) else 0.0)
            scored.append((score, i, sent))
        # Sort by score descending (stable: ties keep sentence order), then
        # greedily take sentences until the word budget is filled. The
        # `or not selected` clause guarantees at least one sentence.
        scored.sort(key=lambda x: -x[0])
        selected = []
        word_count = 0
        for _score, idx, sent in scored:
            sent_words = len(sent.split())
            if word_count + sent_words <= target_len or not selected:
                selected.append((idx, sent))
                word_count += sent_words
            if word_count >= target_len:
                break
        # Restore original sentence order before joining.
        selected.sort(key=lambda x: x[0])
        return ' '.join(sent for _, sent in selected)

    def _split_sentences(self, text):
        """Split on sentence-final punctuation (., !, ?) and semicolons."""
        parts = re.split(r'(?<=[.!?;])\s+', text)
        return [p.strip() for p in parts if p.strip()]
class BartBackend(CompressionBackend):
    """
    BART-based abstractive compression.

    Lazy-loads the HuggingFace summarization pipeline on first use so that
    importing this module never requires `transformers` to be installed.
    """

    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6",
                 device: int = -1):
        """
        model_name: any HuggingFace summarization checkpoint.
        device: HF pipeline device id (-1 = CPU, >= 0 = CUDA device index).
            New backward-compatible parameter; the default preserves the
            previous hard-coded CPU behavior.
        """
        self._model_name = model_name
        self._device = device
        self._summarizer = None  # populated lazily by _load()

    @property
    def name(self) -> str:
        return f'bart:{self._model_name}'

    def _load(self):
        """Build the summarization pipeline once, on first use."""
        if self._summarizer is None:
            from transformers import pipeline
            self._summarizer = pipeline(
                "summarization",
                model=self._model_name,
                device=self._device,
            )

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Summarize *text* to ~target_ratio of its word count.

        Falls back to returning the input unchanged if summarization fails
        (e.g. input too short for the model).
        """
        self._load()
        # Estimate target max_length in tokens (~1.3 tokens per word).
        word_count = len(text.split())
        max_length = max(10, int(word_count * target_ratio * 1.3))
        min_length = max(5, max_length // 4)
        try:
            result = self._summarizer(
                text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False
            )
            return result[0]['summary_text']
        except Exception:
            # Best-effort fallback (typically: text too short to summarize).
            # FIX: previously the exception was bound but discarded silently,
            # hiding genuine failures; log it so they remain visible.
            logging.getLogger(__name__).warning(
                "BART summarization failed; returning input unchanged",
                exc_info=True)
            return text
class BackTranslationBackend(CompressionBackend):
    """
    Paraphrase via round-trip machine translation (en -> de -> en).

    Strictly a TRANSFORMATION rather than compression: output length is
    roughly preserved. It serves as the second stress in the dual-stress
    regime.
    """

    def __init__(self):
        # Both translation pipelines are lazy-loaded together in _load().
        self._en_de = None
        self._de_en = None

    @property
    def name(self) -> str:
        return 'back_translation'

    def _load(self):
        """Build both Helsinki-NLP translation pipelines once, on first use."""
        if self._en_de is not None:
            return
        from transformers import pipeline
        self._en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de", device=-1)
        self._de_en = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en", device=-1)

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Back-translate. target_ratio is ignored (paraphrase preserves length)."""
        self._load()
        forward = self._en_de(text, max_length=512, do_sample=False)
        german = forward[0]['translation_text']
        round_trip = self._de_en(german, max_length=512, do_sample=False)
        return round_trip[0]['translation_text']
# ---------------------------------------------------------------------------
# Factory
# ---------------------------------------------------------------------------
_BACKENDS = {
    'extractive': ExtractiveBackend,
    'bart': BartBackend,
    'back_translation': BackTranslationBackend,
}


def get_backend(name: str = 'extractive', **kwargs) -> CompressionBackend:
    """Instantiate a compression backend by name.

    kwargs are forwarded to the chosen backend's constructor.
    Raises ValueError for unknown backend names.
    """
    # The lossy backends live in a sibling module; import them lazily to
    # avoid circular imports.
    if name in ('lossy', 'lossy_enforced'):
        from .lossy import LossyBackend, LossyEnforcedBackend
        lossy_cls = LossyBackend if name == 'lossy' else LossyEnforcedBackend
        return lossy_cls(**kwargs)
    if name not in _BACKENDS:
        raise ValueError(f"Unknown backend '{name}'. Available: {list(_BACKENDS.keys()) + ['lossy', 'lossy_enforced']}")
    return _BACKENDS[name](**kwargs)
|