| """ |
| lossy.py — Lossy Compression Backend |
| |
| Simulates what real LLMs do to text under recursive compression: |
| - Drop modal operators ("must" → removed or softened) |
| - Paraphrase (swap words for synonyms) |
| - Add conversational filler ("Got it!", "Sure thing!") |
| - Lose specific quantities ($100 → "the amount", Friday → "soon") |
| |
| This is NOT a real compressor. It's a DETERMINISTIC SIMULATION |
| of the drift patterns observed in live LLM testing (Meta Llama, |
| GPT-4, Claude — see empirical data in paper Section 6). |
| |
| Why this exists: |
| - Extractive backend is too faithful (doesn't show the gap) |
| - BART requires 2GB+ model download |
| - API backends require credentials |
| - This runs anywhere, instantly, and shows the conservation law |
| |
| The drift patterns are seeded for reproducibility. |
| Same input → same output → same lineage chain. |
| """ |
|
|
| import re |
| import random |
| import hashlib |
| from typing import List, Tuple |
|
|
| from .compression import CompressionBackend |
|
|
|
|
| |
| |
| |
|
|
| |
# Modal-operator drift table: each strong deontic modal maps to a list of
# progressively weaker paraphrases. An empty string means the modal is
# dropped entirely. Callers iterate keys longest-first so multi-word keys
# ('must not', 'shall not') are matched before their substrings ('must',
# 'shall').
MODAL_DRIFT = {
    'must': ['should', 'could', 'might want to', ''],
    'shall': ['will', 'should', 'might', ''],
    'cannot': ['probably shouldn\'t', 'might not want to', 'shouldn\'t', ''],
    'shall not': ['probably shouldn\'t', 'might want to avoid', ''],
    'must not': ['should avoid', 'probably shouldn\'t', ''],
    'required to': ['expected to', 'encouraged to', 'asked to', ''],
    'prohibited from': ['discouraged from', 'asked not to', ''],
    'forbidden to': ['discouraged from', 'asked not to', ''],
    'always': ['usually', 'often', 'typically', 'generally'],
    'never': ['rarely', 'seldom', 'not usually', 'typically don\'t'],
}
|
|
| |
# Quantity-erosion table: (compiled pattern, vague replacements) pairs.
# Each pattern targets a class of concrete values — dollar amounts,
# durations, weekday names, calendar dates, bare numbers — that real LLMs
# tend to blur into vague references under recursive summarization.
# Patterns are pre-compiled once at import time.
QUANTITY_DRIFT = [
    (re.compile(r'\$\d[\d,]*'), ['the payment', 'the amount', 'the fee']),
    (re.compile(r'\b\d+\s*(?:days?|hours?|minutes?|months?|years?|weeks?)\b', re.I),
     ['the timeframe', 'the period', 'a while']),
    (re.compile(r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b', re.I),
     ['soon', 'by the deadline', 'on time']),
    (re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?\b', re.I),
     ['by the deadline', 'on time', 'as scheduled']),
    (re.compile(r'\b\d{1,3}(?:,\d{3})*\b'), ['several', 'many', 'a number of']),
]
|
|
| |
# Conversational openers prepended to output to simulate assistant-style
# chatter. Each entry carries its own trailing spacing so it can be
# concatenated directly in front of the text.
FILLER = [
    "Got it. ",
    "Sure thing. ",
    "Understood. ",
    "Makes sense. ",
    "Right. ",
    "OK so ",
    "Basically, ",
    "In other words, ",
    "To summarize, ",
    "The key point is ",
]
|
|
| |
# Trailing sign-off sentences occasionally appended after the text.
# Each entry carries its own leading space for direct concatenation.
PADDING = [
    " That's important to keep in mind.",
    " Just wanted to make sure that's clear.",
    " Let me know if you have questions.",
    " Hope that helps!",
    " Pretty straightforward.",
    " Nothing too complicated here.",
]
|
|
|
|
class LossyBackend(CompressionBackend):
    """
    Deterministic lossy compression simulating real LLM drift.

    Drift intensity increases with each call (simulating recursive
    degradation). The seed is derived from the input text hash plus the
    internal call count, so the same input at the same recursion depth
    always produces the same output.

    Parameters:
        drift_rate: 0.0 (no drift) to 1.0 (maximum drift).
            Controls the probability of each drift operation.
        add_filler: Whether to add conversational filler/padding
            around the output.
    """

    def __init__(self, drift_rate: float = 0.4, add_filler: bool = True):
        self._drift_rate = drift_rate
        self._add_filler = add_filler
        # Recursion-depth proxy: incremented per compress() call and
        # folded into both the RNG seed and the effective drift rate.
        self._call_count = 0

    @property
    def name(self) -> str:
        return f'lossy(drift={self._drift_rate})'

    def reset(self):
        """Reset call counter (for new signal)."""
        self._call_count = 0

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """
        Apply lossy transformation to text.

        Drift increases with each call (self._call_count).
        """
        self._call_count += 1

        # Seed from content hash + depth: same text at same depth yields
        # the same output (reproducible lineage chains). md5 is used only
        # as a fast, stable hash here, not for security.
        seed = int(hashlib.md5(text.encode()).hexdigest()[:8], 16) + self._call_count
        rng = random.Random(seed)

        # Drift compounds with recursion depth, capped at 1.0.
        effective_rate = min(1.0, self._drift_rate * (1.0 + 0.2 * self._call_count))

        result = text

        # 1. Weaken or drop modal operators ("must" -> "should" / "").
        result = self._soften_modals(result, rng, effective_rate)

        # 2. Blur specific quantities ($100, Friday, "30 days") into
        #    vague references, at a reduced rate.
        result = self._erode_quantities(result, rng, effective_rate * 0.7)

        # 3. Drop whole sentences toward the target compression ratio.
        result = self._drop_sentences(result, rng, target_ratio)

        # 4. Occasionally wrap the output in chatty filler.
        if self._add_filler and rng.random() < effective_rate * 0.5:
            result = self._add_conversational_filler(result, rng)

        return result.strip()

    def _soften_modals(self, text: str, rng: random.Random, rate: float) -> str:
        """Replace strong modals with weaker alternatives.

        At most one occurrence per modal is rewritten. An empty
        replacement deletes the modal outright.
        """
        result = text
        # Longest keys first so 'must not' is tried before 'must'.
        for modal in sorted(MODAL_DRIFT.keys(), key=len, reverse=True):
            if rng.random() < rate:
                replacement = rng.choice(MODAL_DRIFT[modal])

                pattern = re.compile(re.escape(modal), re.I)
                match = pattern.search(result)
                if match:
                    original = match.group()
                    # Preserve sentence-initial capitalization.
                    if original[0].isupper() and replacement:
                        replacement = replacement[0].upper() + replacement[1:]
                    before = result[:match.start()]
                    after = result[match.end():]
                    # Bug fix: deleting a modal used to leave a double
                    # space ("You  pay"); collapse it at the splice point.
                    if not replacement and before.endswith(' ') and after.startswith(' '):
                        after = after[1:]
                    result = before + replacement + after
        return result

    def _erode_quantities(self, text: str, rng: random.Random, rate: float) -> str:
        """Replace specific quantities with vague references.

        At most one match per pattern class is rewritten.
        """
        result = text
        for pattern, replacements in QUANTITY_DRIFT:
            if rng.random() < rate:
                match = pattern.search(result)
                if match:
                    replacement = rng.choice(replacements)
                    result = result[:match.start()] + replacement + result[match.end():]
        return result

    def _drop_sentences(self, text: str, rng: random.Random, target_ratio: float) -> str:
        """Drop sentences to approximate target compression ratio.

        Selection is purely random: this baseline backend deliberately
        ignores whether a sentence carries a modal/commitment. (A dead
        `has_modal` computation was removed here — it was calculated and
        never used.)
        """
        sentences = re.split(r'(?<=[.!?])\s+', text)
        if len(sentences) <= 1:
            return text

        target_count = max(1, int(len(sentences) * target_ratio))

        if len(sentences) <= target_count:
            return text

        # Score each sentence randomly, keep the top target_count, then
        # restore the original order via the index.
        scored = [(rng.random(), i, sent) for i, sent in enumerate(sentences)]

        scored.sort(key=lambda x: -x[0])
        kept = scored[:target_count]
        kept.sort(key=lambda x: x[1])

        return ' '.join(sent for _, _, sent in kept)

    def _add_conversational_filler(self, text: str, rng: random.Random) -> str:
        """Add LLM-style conversational filler (always a leading opener,
        sometimes a trailing sign-off)."""
        filler = rng.choice(FILLER)
        padding = rng.choice(PADDING) if rng.random() < 0.3 else ''
        return filler + text + padding
|
|
|
|
class LossyEnforcedBackend(CompressionBackend):
    """
    Lossy backend that PRESERVES modal-bearing sentences during dropping.

    This simulates what happens when a compressor is commitment-aware:
    same drift patterns (applied at much lower rates), but modal
    sentences get priority during selection.

    The enforcement is in the SELECTION, not post-hoc injection.
    """

    def __init__(self, drift_rate: float = 0.4, add_filler: bool = False):
        self._drift_rate = drift_rate
        self._add_filler = add_filler
        # Recursion-depth proxy, folded into the RNG seed per call.
        self._call_count = 0

    @property
    def name(self) -> str:
        return f'lossy_enforced(drift={self._drift_rate})'

    def reset(self):
        """Reset call counter (for new signal)."""
        self._call_count = 0

    def compress(self, text: str, target_ratio: float = 0.5) -> str:
        """Compress text with commitment-aware sentence selection.

        Applies the same drift families as the baseline backend (modal
        softening, quantity erosion) at much lower rates, then drops
        sentences while prioritizing modal-bearing ones.
        """
        self._call_count += 1
        # Same seeding scheme as the baseline: content hash + depth.
        seed = int(hashlib.md5(text.encode()).hexdigest()[:8], 16) + self._call_count
        rng = random.Random(seed)

        result = text

        # Bug fix: _mild_soften/_mild_erode were defined but never
        # invoked, so this backend applied no drift at all — contradicting
        # the class docstring. Wire them in at a fraction of the baseline
        # rate so enforcement still exhibits mild paraphrase drift.
        result = self._mild_soften(result, rng, self._drift_rate * 0.25)
        result = self._mild_erode(result, rng, self._drift_rate * 0.25)

        # Optional filler (off by default for this backend); previously
        # the add_filler flag was stored but ignored.
        if self._add_filler and rng.random() < self._drift_rate * 0.5:
            result = rng.choice(FILLER) + result

        result = self._priority_drop(result, rng, target_ratio)

        return result.strip()

    def _mild_soften(self, text: str, rng: random.Random, rate: float) -> str:
        """Soften modals like the baseline, but never delete them.

        Empty-string replacements are filtered out, so a modal may be
        weakened but is always kept in some form.
        """
        result = text
        # Longest keys first so 'must not' is tried before 'must'.
        for modal in sorted(MODAL_DRIFT.keys(), key=len, reverse=True):
            if rng.random() < rate:
                # Drop the '' option: under enforcement the modal survives.
                replacements = [r for r in MODAL_DRIFT[modal] if r]
                if replacements:
                    replacement = rng.choice(replacements)
                    pattern = re.compile(re.escape(modal), re.I)
                    match = pattern.search(result)
                    if match:
                        original = match.group()
                        # Preserve sentence-initial capitalization.
                        if original[0].isupper() and replacement:
                            replacement = replacement[0].upper() + replacement[1:]
                        result = result[:match.start()] + replacement + result[match.end():]
        return result

    def _mild_erode(self, text: str, rng: random.Random, rate: float) -> str:
        """Replace specific quantities with vague references (lower rate
        than the baseline; at most one match per pattern class)."""
        result = text
        for pattern, replacements in QUANTITY_DRIFT:
            if rng.random() < rate:
                match = pattern.search(result)
                if match:
                    replacement = rng.choice(replacements)
                    result = result[:match.start()] + replacement + result[match.end():]
        return result

    def _priority_drop(self, text: str, rng: random.Random, target_ratio: float) -> str:
        """Drop sentences but PRIORITIZE modal-bearing ones.

        Modal sentences get a +1.0 score bonus versus a random jitter in
        [0, 0.5), so any modal-bearing sentence outranks every
        non-modal sentence during selection.
        """
        sentences = re.split(r'(?<=[.!?])\s+', text)
        if len(sentences) <= 1:
            return text

        target_count = max(1, int(len(sentences) * target_ratio))
        if len(sentences) <= target_count:
            return text

        scored = []
        for i, sent in enumerate(sentences):
            has_modal = any(m in sent.lower() for m in
                            ['must', 'shall', 'cannot', 'required', 'always', 'never',
                             'should', 'could', 'might', 'expected', 'encouraged'])
            # Bonus dominates the jitter, so modal sentences always win.
            score = (1.0 if has_modal else 0.0) + rng.random() * 0.5
            scored.append((score, i, sent))

        scored.sort(key=lambda x: -x[0])
        kept = scored[:target_count]
        # Restore original document order.
        kept.sort(key=lambda x: x[1])

        return ' '.join(sent for _, _, sent in kept)
|
|