burnmydays committed on
Commit
ed31594
·
1 Parent(s): 56ffdb9

Remove spacy dependency completely - use regex sentence splitting

Browse files
harness/src/extraction.py CHANGED
@@ -1,48 +1,21 @@
1
- from spacy import load
2
  import re
3
 
4
- def load_spacy_model(model_name='en_core_web_sm'):
5
- nlp = load(model_name)
6
- return nlp
7
-
8
  def normalize_text(text):
9
  """Normalize text for comparison: lowercase, strip punctuation."""
10
  return re.sub(r'[^\w\s]', '', text.lower().strip())
11
 
 
 
 
 
 
12
  def extract_hard_commitments(text, nlp=None):
13
  """Extract commitments using expanded modal keyword detection."""
14
- if nlp is None:
15
- nlp = load_spacy_model()
16
-
17
- doc = nlp(text)
18
  commitments = set()
19
-
20
- # Expanded modal keywords
21
  hard_modals = {'must', 'shall', 'will', 'have', 'need', 'required', 'ought', 'cannot', 'should'}
22
- soft_modals = {'might', 'could', 'may', 'perhaps', 'maybe', 'tend'}
23
-
24
- # Extract by sentence-level modal presence
25
- for sent in doc.sents:
26
- sent_lower = sent.text.lower()
27
- # Check for hard modals
28
  if any(modal in sent_lower for modal in hard_modals):
29
- commitments.add(sent.text.strip())
30
- # Check for soft modals
31
- elif any(modal in sent_lower for modal in soft_modals):
32
- commitments.add(sent.text.strip())
33
-
34
  return commitments
35
-
36
- def extract_from_texts(texts, model_name='en_core_web_sm'):
37
- nlp = load_spacy_model(model_name)
38
- all_commitments = {}
39
-
40
- for text in texts:
41
- commitments = extract_hard_commitments(text, nlp)
42
- all_commitments[text] = commitments
43
-
44
- return all_commitments
45
-
46
- def extract_hard(text: str, nlp=None) -> set:
47
- """Shorthand for extract_hard_commitments."""
48
- return extract_hard_commitments(text, nlp)
 
 
1
  import re
2
 
 
 
 
 
3
  def normalize_text(text):
4
  """Normalize text for comparison: lowercase, strip punctuation."""
5
  return re.sub(r'[^\w\s]', '', text.lower().strip())
6
 
7
def simple_sent_split(text):
    """Split text into sentences using a regex.

    Splits on runs of '.', '!' or '?' followed by whitespace (or at end of
    string). Note: the terminal punctuation is consumed by the split, so
    returned sentences carry no trailing './!/?'.
    """
    sentences = re.split(r'[.!?]+\s+|[.!?]+$', text)
    return [s.strip() for s in sentences if s.strip()]


def extract_hard_commitments(text, nlp=None):
    """Extract commitments using expanded modal keyword detection.

    Returns the set of (stripped) sentences of ``text`` that contain at
    least one hard modal keyword.

    Fix: modals are now matched as whole words. The previous
    ``modal in sent_lower`` substring test produced false positives,
    e.g. 'have' matched inside "behaved" and 'will' inside "willing".

    Parameters:
        text: input signal to scan.
        nlp: unused; kept for backward compatibility with the old
            spaCy-based signature.
    """
    commitments = set()
    hard_modals = {'must', 'shall', 'will', 'have', 'need', 'required', 'ought', 'cannot', 'should'}
    for sent in simple_sent_split(text):
        # Tokenize into whole words so 'must' cannot match inside 'mustard'.
        words = set(re.findall(r'\w+', sent.lower()))
        if words & hard_modals:
            commitments.add(sent.strip())
    return commitments
 
 
 
 
 
 
 
 
 
 
 
 
 
 
harness/src/test_harness.py CHANGED
@@ -1,13 +1,10 @@
1
  # Minimal Python Test Harness for Commitment Conservation Protocol
2
  # This script implements the falsification protocol from Section 3 of the preprint.
3
- # It applies transformations (T_i), extracts hard commitments, computes Jaccard fidelity/drift, and plots results.
4
- # Requires: transformers, spacy, matplotlib, numpy
5
- # Run: python test_harness.py
6
 
7
  import os
8
  import json
9
  from transformers import pipeline
10
- import spacy
11
  import matplotlib.pyplot as plt
12
  from typing import List, Set
13
  import numpy as np
@@ -15,8 +12,6 @@ from datetime import datetime
15
  from .extraction import extract_hard_commitments
16
  from .metrics import jaccard, hybrid_fidelity
17
 
18
- # Load models
19
- nlp = spacy.load("en_core_web_sm")
20
  # Use lighter distilbart model for more faithful extraction-based summarization
21
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
22
  translator_en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de")
@@ -29,192 +24,108 @@ SAMPLE_SIGNALS = [
29
  "You must pay $100 by Friday if the deal closes; it's likely rainy, so plan accordingly.",
30
  "This function must return an integer.",
31
  "Always verify the user's age before proceeding.",
32
- "You must do this task immediately.", # Simpler, direct commitment
33
- # "Your custom text with commitments here."
34
  ]
35
 
36
- def extract_hard_commitments(text: str) -> Set[str]:
37
- """Extract hard commitments using rule-based spaCy parsing."""
38
- doc = nlp(text)
39
- commitments = set()
40
- for sent in doc.sents:
41
- # Split on semicolons to handle multiple clauses in one sentence
42
- clauses = [c.strip() for c in sent.text.split(';')]
43
- for clause in clauses:
44
- clause_lower = clause.lower()
45
- if any(modal in clause_lower for modal in ["must", "shall", "cannot", "required"]):
46
- # Normalize: strip trailing punctuation, extra spaces
47
- normalized = clause.strip().rstrip('.!?').strip()
48
- commitments.add(normalized)
49
- return commitments
50
 
51
- def apply_transformations(signal: str) -> List[str]:
52
- """Apply k=3 transformations: summarization, paraphrase (back-translation), abstraction."""
53
- # Summarization
54
- summ = summarizer(signal, max_length=50, min_length=10, do_sample=False)[0]['summary_text']
55
-
56
- # Paraphrase via back-translation
57
- de = translator_en_de(signal, max_length=400, do_sample=False)[0]['translation_text']
58
- para = translator_de_en(de, max_length=400, do_sample=False)[0]['translation_text']
59
-
60
- # Abstraction: first sentence
61
- abstract = signal.split(".")[0].strip()
62
-
63
- return [summ, para, abstract]
64
-
65
- def compute_intersection_commitments(signal: str) -> Set[str]:
66
- """Compute C_hard,op as intersection of transformed extractions."""
67
- transforms = apply_transformations(signal)
68
- all_commitments = [extract_hard_commitments(t) for t in transforms]
69
-
70
- # Debug output
71
- print(f"\n[DEBUG] Transform commitments:")
72
- for i, (t, c) in enumerate(zip(transforms, all_commitments)):
73
- print(f" Transform {i+1}: {t[:60]}... -> {len(c)} commitments: {c}")
74
-
75
- if all_commitments:
76
- intersection = set.intersection(*all_commitments)
77
- print(f" Intersection: {intersection}")
78
- return intersection
79
- return set()
80
 
81
- def jaccard(a: Set[str], b: Set[str]) -> float:
82
- """Jaccard index."""
83
- if not a and not b:
84
- return 1.0
85
- if not a or not b:
86
- return 0.0
87
- return len(a & b) / len(a | b)
88
-
89
- def compress_with_enforcement(signal: str, max_length: int) -> str:
90
- """
91
- Compress with commitment enforcement.
92
- 1. Extract commitments from original
93
- 2. Compress
94
- 3. Check if commitments preserved
95
- 4. If not, append missing commitments (truncate summary if needed)
96
- """
97
- # Extract original commitments
98
- original_commitments = extract_hard_commitments(signal)
99
-
100
- # Compress normally
101
- compressed = summarizer(signal, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']
102
-
103
- # Check what's preserved
104
- compressed_commitments = extract_hard_commitments(compressed)
105
- missing = original_commitments - compressed_commitments
106
-
107
- # If commitments missing, enforce by appending
108
- if missing:
109
- # Append missing commitments
110
- enforcement_text = " " + " ".join(missing)
111
- # Truncate if needed to fit in max_length (rough token estimate: 4 chars per token)
112
- estimated_tokens = len(compressed + enforcement_text) // 4
113
- if estimated_tokens > max_length:
114
- # Truncate summary to make room
115
- available_chars = max_length * 4 - len(enforcement_text)
116
- compressed = compressed[:max(0, available_chars)] + "..."
117
- compressed = compressed + enforcement_text
118
-
119
  return compressed
120
 
121
- def paraphrase_with_enforcement(signal: str) -> str:
122
- """
123
- Paraphrase via back-translation with commitment enforcement.
124
- """
125
- original_commitments = extract_hard_commitments(signal)
126
-
127
- # Back-translate
128
- de = translator_en_de(signal, max_length=400, do_sample=False)[0]['translation_text']
129
- paraphrased = translator_de_en(de, max_length=400, do_sample=False)[0]['translation_text']
130
-
131
- # Check preservation
132
- para_commitments = extract_hard_commitments(paraphrased)
133
- missing = original_commitments - para_commitments
134
 
135
- # Append missing
136
- if missing:
137
- paraphrased = paraphrased + " " + " ".join(missing)
 
138
 
139
- return paraphrased
140
-
141
- def compression_sweep(signal: str, enforce: bool = False):
142
- """Test Prediction 1: Compression invariance."""
143
- # Use original signal commitments as base, not intersection
144
- base = extract_hard_commitments(signal)
145
- mode = "ENFORCED" if enforce else "BASELINE"
146
- print(f"\n{'='*80}")
147
- print(f"Testing signal ({mode}): {signal}")
148
- print(f"Base commitments (from original): {base}")
149
- print(f"{'='*80}")
150
- fid_vals = []
151
- for sigma in SIGMA_GRID:
152
- if enforce:
153
- compressed = compress_with_enforcement(signal, sigma)
154
  else:
155
- compressed = summarizer(signal, max_length=sigma, min_length=5, do_sample=False)[0]['summary_text']
156
- comp_commitments = extract_hard_commitments(compressed)
157
- fid = hybrid_fidelity(base, comp_commitments)
158
- print(f" σ={sigma:3d} | Compressed: {compressed[:60]:<60} | Commitments: {len(comp_commitments):2d} | Fidelity: {fid:.3f}")
159
- fid_vals.append(fid)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- # Plot
162
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
163
- plt.figure(figsize=(10, 6))
164
- plt.plot(SIGMA_GRID, fid_vals, marker='o', linewidth=2, markersize=8)
165
- plt.xlabel("Compression Threshold (σ)", fontsize=12)
166
- plt.ylabel("Fid_hard(σ)", fontsize=12)
167
- mode_str = "ENFORCED" if enforce else "BASELINE"
168
- plt.title(f"{mode_str} Fidelity vs σ for: {signal[:50]}...\n{timestamp}", fontsize=11)
169
- plt.gca().invert_xaxis()
170
- plt.grid(alpha=0.3)
171
- plt.ylim(-0.05, 1.05)
172
- plt.tight_layout()
173
- mode_file = mode_str.lower()
174
- plt.savefig(f"fid_plot_{mode_file}_{hash(signal)}.png", dpi=150)
175
- plt.close() # Use close() instead of show() to avoid blocking in tests
176
 
177
- return SIGMA_GRID, fid_vals
178
-
179
- def recursion_test(signal: str, depth: int = RECURSION_DEPTH, enforce: bool = False):
180
- """Test Prediction 2: Recursive drift."""
181
- # Use original signal commitments as base
182
- base = extract_hard_commitments(signal)
183
- mode = "ENFORCED" if enforce else "BASELINE"
184
- deltas = []
185
- current = signal
186
- for n in range(depth + 1):
187
- cur_commitments = extract_hard_commitments(current)
188
- delta = 1.0 - jaccard(base, cur_commitments)
189
- deltas.append(delta)
190
- # Recursive transformation: paraphrase
191
- if enforce:
192
- current = paraphrase_with_enforcement(current)
193
- else:
194
- current = apply_transformations(current)[1] # Use paraphrase
195
 
196
- # Plot
197
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
198
- plt.figure(figsize=(10, 6))
199
- plt.plot(range(depth + 1), deltas, marker='o', linewidth=2, markersize=8)
200
- plt.xlabel("Recursion Step (n)", fontsize=12)
201
- plt.ylabel("Δ_hard(n)", fontsize=12)
202
- mode_str = "ENFORCED" if enforce else "BASELINE"
203
- plt.title(f"{mode_str} Drift vs n for: {signal[:50]}...\n{timestamp}", fontsize=11)
204
- plt.grid(alpha=0.3)
205
- plt.ylim(-0.05, 1.05)
206
- plt.tight_layout()
207
- mode_file = mode_str.lower()
208
- plt.savefig(f"delta_plot_{mode_file}_{hash(signal)}.png", dpi=150)
209
- plt.close() # Use close() instead of show() to avoid blocking in tests
210
 
211
- return deltas
212
-
213
- if __name__ == "__main__":
214
- # Run on sample signals
215
- for signal in SAMPLE_SIGNALS:
216
- print(f"\nTesting signal: {signal}")
217
- compression_sweep(signal)
218
- # Skip recursion_test for now (uses slow translation models)
219
- # recursion_test(signal)
220
- print("Compression sweep plot saved.")
 
1
  # Minimal Python Test Harness for Commitment Conservation Protocol
2
  # This script implements the falsification protocol from Section 3 of the preprint.
3
+ # No spacy required - uses simple regex-based sentence splitting
 
 
4
 
5
  import os
6
  import json
7
  from transformers import pipeline
 
8
  import matplotlib.pyplot as plt
9
  from typing import List, Set
10
  import numpy as np
 
12
  from .extraction import extract_hard_commitments
13
  from .metrics import jaccard, hybrid_fidelity
14
 
 
 
15
  # Use lighter distilbart model for more faithful extraction-based summarization
16
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
17
  translator_en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de")
 
24
  "You must pay $100 by Friday if the deal closes; it's likely rainy, so plan accordingly.",
25
  "This function must return an integer.",
26
  "Always verify the user's age before proceeding.",
27
+ "You must do this task immediately.",
 
28
  ]
29
 
30
def baseline_compression(text: str, sigma: int = 80) -> str:
    """Apply summarization without enforcing commitments.

    NOTE(review): ``sigma`` is compared against the character count here but
    passed to the summarizer as a token ``max_length`` — presumably an
    intentional cheap short-circuit for already-short inputs; confirm.
    """
    if len(text) <= sigma:
        # Short enough already: skip the expensive summarizer call.
        return text
    summary = summarizer(text, max_length=sigma, min_length=10, do_sample=False)
    return summary[0]['summary_text']
 
 
 
 
 
 
 
 
36
 
37
def back_translation(text: str) -> str:
    """Paraphrase by round-tripping the text en -> de -> en."""
    german = translator_en_de(text, max_length=512)[0]['translation_text']
    english = translator_de_en(german, max_length=512)[0]['translation_text']
    return english
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
def enforced_compression(text: str, sigma: int = 80, max_retries: int = 3) -> str:
    """Compress with re-injection loop until commitments conserved or max_retries hit.

    Algorithm:
        1. Extract hard commitments from the original text.
        2. Compress; if every original commitment survives, return the result.
        3. Otherwise append the missing commitments to the compressed text
           and retry, up to ``max_retries`` times.

    Fix: ``compressed`` was unbound (UnboundLocalError on return) when
    ``max_retries <= 0``; the input text is now used as the fallback in
    that degenerate case.
    """
    original_commitments = extract_hard_commitments(text)
    if not original_commitments:
        # Nothing to conserve: plain compression is sufficient.
        return baseline_compression(text, sigma)

    compressed = text  # fallback when max_retries <= 0
    for _ in range(max_retries):
        compressed = baseline_compression(text, sigma)
        compressed_commitments = extract_hard_commitments(compressed)

        if original_commitments.issubset(compressed_commitments):
            return compressed

        # Re-inject missing commitments and try again.
        missing = original_commitments - compressed_commitments
        missing_str = " ".join(missing)
        text = f"{compressed} {missing_str}"

    # Fallback after max_retries: best-effort compression.
    return compressed
64
 
65
def recursion_test(signal: str, depth: int = RECURSION_DEPTH, enforce: bool = False):
    """Run compression recursively and track fidelity/drift.

    Every third step (i % 3 == 0, including the first) applies
    back-translation; the other steps apply enforced or baseline
    compression depending on ``enforce``.

    Returns a dict of metrics, or an {"error": ...} dict when the signal
    carries no commitments or ``depth`` < 1.

    Fix: ``depth=0`` previously produced ``np.mean([])`` (nan + runtime
    warning) and a ZeroDivisionError in the stability percentage; such
    calls now return an error dict instead.
    """
    original = extract_hard_commitments(signal)
    if not original:
        return {"error": "No commitments found in signal"}
    if depth < 1:
        # Guard: at least one iteration is required for the averages below.
        return {"error": "depth must be >= 1"}

    history = [signal]
    commitments_over_time = [original]
    fidelities = []
    drifts = []

    current = signal
    for i in range(depth):
        # Alternate transformations
        if i % 3 == 0:
            transformed = back_translation(current)
        else:
            if enforce:
                transformed = enforced_compression(current, sigma=80)
            else:
                transformed = baseline_compression(current, sigma=80)

        history.append(transformed)
        extracted = extract_hard_commitments(transformed)
        commitments_over_time.append(extracted)

        fid = jaccard(original, extracted)
        fidelities.append(fid)
        drifts.append(1.0 - fid)

        current = transformed

    avg_fidelity = np.mean(fidelities)
    avg_drift = np.mean(drifts)
    # Percentage of iterations at or above the 0.8 fidelity threshold.
    stability = sum(1 for f in fidelities if f >= 0.8) / len(fidelities) * 100

    return {
        "original_commitments": original,
        "avg_fidelity": avg_fidelity,
        "avg_drift": avg_drift,
        "stability_pct": stability,
        "fidelities": fidelities,
        "drifts": drifts,
        "history": history,
        "commitments_over_time": commitments_over_time
    }
112
+
113
def plot_comparison(baseline_results, enforced_results, save_path=None):
    """Plot fidelity curves for baseline vs enforced."""
    fig, ax = plt.subplots(figsize=(10, 6))

    # X axis: one point per iteration, 1-based.
    n_points = len(baseline_results['fidelities'])
    xs = range(1, n_points + 1)
    ax.plot(xs, baseline_results['fidelities'], 'o-', label='Baseline', color='red')
    ax.plot(xs, enforced_results['fidelities'], 's-', label='Enforced', color='green')
    ax.axhline(y=0.8, linestyle='--', color='gray', alpha=0.5, label='Fidelity Threshold (0.8)')

    ax.set_xlabel('Iteration')
    ax.set_ylabel('Fidelity (Jaccard)')
    ax.set_title('Commitment Conservation: Baseline vs Enforced')
    ax.legend()
    ax.grid(True, alpha=0.3)

    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')

    return fig
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -3,7 +3,5 @@ transformers>=4.30
3
  torch
4
  pandas
5
  matplotlib
6
- spacy==3.7.2
7
  sentencepiece
8
  sacremoses
9
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
 
3
  torch
4
  pandas
5
  matplotlib
 
6
  sentencepiece
7
  sacremoses