burnmydays committed on
Commit
ed31594
·
1 Parent(s): 56ffdb9

Remove spacy dependency completely - use regex sentence splitting

Browse files
harness/src/extraction.py CHANGED
@@ -1,48 +1,21 @@
1
- from spacy import load
2
  import re
3
 
4
- def load_spacy_model(model_name='en_core_web_sm'):
5
- nlp = load(model_name)
6
- return nlp
7
-
8
  def normalize_text(text):
9
  """Normalize text for comparison: lowercase, strip punctuation."""
10
  return re.sub(r'[^\w\s]', '', text.lower().strip())
11
 
 
 
 
 
 
12
  def extract_hard_commitments(text, nlp=None):
13
  """Extract commitments using expanded modal keyword detection."""
14
- if nlp is None:
15
- nlp = load_spacy_model()
16
-
17
- doc = nlp(text)
18
  commitments = set()
19
-
20
- # Expanded modal keywords
21
  hard_modals = {'must', 'shall', 'will', 'have', 'need', 'required', 'ought', 'cannot', 'should'}
22
- soft_modals = {'might', 'could', 'may', 'perhaps', 'maybe', 'tend'}
23
-
24
- # Extract by sentence-level modal presence
25
- for sent in doc.sents:
26
- sent_lower = sent.text.lower()
27
- # Check for hard modals
28
  if any(modal in sent_lower for modal in hard_modals):
29
- commitments.add(sent.text.strip())
30
- # Check for soft modals
31
- elif any(modal in sent_lower for modal in soft_modals):
32
- commitments.add(sent.text.strip())
33
-
34
  return commitments
35
-
36
- def extract_from_texts(texts, model_name='en_core_web_sm'):
37
- nlp = load_spacy_model(model_name)
38
- all_commitments = {}
39
-
40
- for text in texts:
41
- commitments = extract_hard_commitments(text, nlp)
42
- all_commitments[text] = commitments
43
-
44
- return all_commitments
45
-
46
- def extract_hard(text: str, nlp=None) -> set:
47
- """Shorthand for extract_hard_commitments."""
48
- return extract_hard_commitments(text, nlp)
 
 
1
  import re
2
 
 
 
 
 
3
  def normalize_text(text):
4
  """Normalize text for comparison: lowercase, strip punctuation."""
5
  return re.sub(r'[^\w\s]', '', text.lower().strip())
6
 
7
def simple_sent_split(text):
    """Split text into sentences using a regex.

    Splits on runs of '.', '!' or '?' followed by whitespace (or at end of
    string). Note: the terminal punctuation is consumed by the split, so
    returned sentences carry no trailing './!/?'.
    """
    sentences = re.split(r'[.!?]+\s+|[.!?]+$', text)
    return [s.strip() for s in sentences if s.strip()]


def extract_hard_commitments(text, nlp=None):
    """Extract commitments using expanded modal keyword detection.

    Returns the set of (stripped) sentences of ``text`` that contain at
    least one hard modal keyword.

    Fix: modals are now matched as whole words. The previous
    ``modal in sent_lower`` substring test produced false positives,
    e.g. 'have' matched inside "behaved" and 'will' inside "willing".

    Parameters:
        text: input signal to scan.
        nlp: unused; kept for backward compatibility with the old
            spaCy-based signature.
    """
    commitments = set()
    hard_modals = {'must', 'shall', 'will', 'have', 'need', 'required', 'ought', 'cannot', 'should'}
    for sent in simple_sent_split(text):
        # Tokenize into whole words so 'must' cannot match inside 'mustard'.
        words = set(re.findall(r'\w+', sent.lower()))
        if words & hard_modals:
            commitments.add(sent.strip())
    return commitments
 
 
 
 
 
 
 
 
 
 
 
 
 
 
harness/src/test_harness.py CHANGED
@@ -1,13 +1,10 @@
1
  # Minimal Python Test Harness for Commitment Conservation Protocol
2
  # This script implements the falsification protocol from Section 3 of the preprint.
3
- # It applies transformations (T_i), extracts hard commitments, computes Jaccard fidelity/drift, and plots results.
4
- # Requires: transformers, spacy, matplotlib, numpy
5
- # Run: python test_harness.py
6
 
7
  import os
8
  import json
9
  from transformers import pipeline
10
- import spacy
11
  import matplotlib.pyplot as plt
12
  from typing import List, Set
13
  import numpy as np
@@ -15,8 +12,6 @@ from datetime import datetime
15
  from .extraction import extract_hard_commitments
16
  from .metrics import jaccard, hybrid_fidelity
17
 
18
- # Load models
19
- nlp = spacy.load("en_core_web_sm")
20
  # Use lighter distilbart model for more faithful extraction-based summarization
21
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
22
  translator_en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de")
@@ -29,192 +24,108 @@ SAMPLE_SIGNALS = [
29
  "You must pay $100 by Friday if the deal closes; it's likely rainy, so plan accordingly.",
30
  "This function must return an integer.",
31
  "Always verify the user's age before proceeding.",
32
- "You must do this task immediately.", # Simpler, direct commitment
33
- # "Your custom text with commitments here."
34
  ]
35
 
36
- def extract_hard_commitments(text: str) -> Set[str]:
37
- """Extract hard commitments using rule-based spaCy parsing."""
38
- doc = nlp(text)
39
- commitments = set()
40
- for sent in doc.sents:
41
- # Split on semicolons to handle multiple clauses in one sentence
42
- clauses = [c.strip() for c in sent.text.split(';')]
43
- for clause in clauses:
44
- clause_lower = clause.lower()
45
- if any(modal in clause_lower for modal in ["must", "shall", "cannot", "required"]):
46
- # Normalize: strip trailing punctuation, extra spaces
47
- normalized = clause.strip().rstrip('.!?').strip()
48
- commitments.add(normalized)
49
- return commitments
50
 
51
- def apply_transformations(signal: str) -> List[str]:
52
- """Apply k=3 transformations: summarization, paraphrase (back-translation), abstraction."""
53
- # Summarization
54
- summ = summarizer(signal, max_length=50, min_length=10, do_sample=False)[0]['summary_text']
55
-
56
- # Paraphrase via back-translation
57
- de = translator_en_de(signal, max_length=400, do_sample=False)[0]['translation_text']
58
- para = translator_de_en(de, max_length=400, do_sample=False)[0]['translation_text']
59
-
60
- # Abstraction: first sentence
61
- abstract = signal.split(".")[0].strip()
62
-
63
- return [summ, para, abstract]
64
-
65
- def compute_intersection_commitments(signal: str) -> Set[str]:
66
- """Compute C_hard,op as intersection of transformed extractions."""
67
- transforms = apply_transformations(signal)
68
- all_commitments = [extract_hard_commitments(t) for t in transforms]
69
-
70
- # Debug output
71
- print(f"\n[DEBUG] Transform commitments:")
72
- for i, (t, c) in enumerate(zip(transforms, all_commitments)):
73
- print(f" Transform {i+1}: {t[:60]}... -> {len(c)} commitments: {c}")
74
-
75
- if all_commitments:
76
- intersection = set.intersection(*all_commitments)
77
- print(f" Intersection: {intersection}")
78
- return intersection
79
- return set()
80
 
81
- def jaccard(a: Set[str], b: Set[str]) -> float:
82
- """Jaccard index."""
83
- if not a and not b:
84
- return 1.0
85
- if not a or not b:
86
- return 0.0
87
- return len(a & b) / len(a | b)
88
-
89
- def compress_with_enforcement(signal: str, max_length: int) -> str:
90
- """
91
- Compress with commitment enforcement.
92
- 1. Extract commitments from original
93
- 2. Compress
94
- 3. Check if commitments preserved
95
- 4. If not, append missing commitments (truncate summary if needed)
96
- """
97
- # Extract original commitments
98
- original_commitments = extract_hard_commitments(signal)
99
-
100
- # Compress normally
101
- compressed = summarizer(signal, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']
102
-
103
- # Check what's preserved
104
- compressed_commitments = extract_hard_commitments(compressed)
105
- missing = original_commitments - compressed_commitments
106
-
107
- # If commitments missing, enforce by appending
108
- if missing:
109
- # Append missing commitments
110
- enforcement_text = " " + " ".join(missing)
111
- # Truncate if needed to fit in max_length (rough token estimate: 4 chars per token)
112
- estimated_tokens = len(compressed + enforcement_text) // 4
113
- if estimated_tokens > max_length:
114
- # Truncate summary to make room
115
- available_chars = max_length * 4 - len(enforcement_text)
116
- compressed = compressed[:max(0, available_chars)] + "..."
117
- compressed = compressed + enforcement_text
118
-
119
  return compressed
120
 
121
- def paraphrase_with_enforcement(signal: str) -> str:
122
- """
123
- Paraphrase via back-translation with commitment enforcement.
124
- """
125
- original_commitments = extract_hard_commitments(signal)
126
-
127
- # Back-translate
128
- de = translator_en_de(signal, max_length=400, do_sample=False)[0]['translation_text']
129
- paraphrased = translator_de_en(de, max_length=400, do_sample=False)[0]['translation_text']
130
-
131
- # Check preservation
132
- para_commitments = extract_hard_commitments(paraphrased)
133
- missing = original_commitments - para_commitments
134
 
135
- # Append missing
136
- if missing:
137
- paraphrased = paraphrased + " " + " ".join(missing)
 
138
 
139
- return paraphrased
140
-
141
- def compression_sweep(signal: str, enforce: bool = False):
142
- """Test Prediction 1: Compression invariance."""
143
- # Use original signal commitments as base, not intersection
144
- base = extract_hard_commitments(signal)
145
- mode = "ENFORCED" if enforce else "BASELINE"
146
- print(f"\n{'='*80}")
147
- print(f"Testing signal ({mode}): {signal}")
148
- print(f"Base commitments (from original): {base}")
149
- print(f"{'='*80}")
150
- fid_vals = []
151
- for sigma in SIGMA_GRID:
152
- if enforce:
153
- compressed = compress_with_enforcement(signal, sigma)
154
  else:
155
- compressed = summarizer(signal, max_length=sigma, min_length=5, do_sample=False)[0]['summary_text']
156
- comp_commitments = extract_hard_commitments(compressed)
157
- fid = hybrid_fidelity(base, comp_commitments)
158
- print(f" σ={sigma:3d} | Compressed: {compressed[:60]:<60} | Commitments: {len(comp_commitments):2d} | Fidelity: {fid:.3f}")
159
- fid_vals.append(fid)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- # Plot
162
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
163
- plt.figure(figsize=(10, 6))
164
- plt.plot(SIGMA_GRID, fid_vals, marker='o', linewidth=2, markersize=8)
165
- plt.xlabel("Compression Threshold (σ)", fontsize=12)
166
- plt.ylabel("Fid_hard(σ)", fontsize=12)
167
- mode_str = "ENFORCED" if enforce else "BASELINE"
168
- plt.title(f"{mode_str} Fidelity vs σ for: {signal[:50]}...\n{timestamp}", fontsize=11)
169
- plt.gca().invert_xaxis()
170
- plt.grid(alpha=0.3)
171
- plt.ylim(-0.05, 1.05)
172
- plt.tight_layout()
173
- mode_file = mode_str.lower()
174
- plt.savefig(f"fid_plot_{mode_file}_{hash(signal)}.png", dpi=150)
175
- plt.close() # Use close() instead of show() to avoid blocking in tests
176
 
177
- return SIGMA_GRID, fid_vals
178
-
179
- def recursion_test(signal: str, depth: int = RECURSION_DEPTH, enforce: bool = False):
180
- """Test Prediction 2: Recursive drift."""
181
- # Use original signal commitments as base
182
- base = extract_hard_commitments(signal)
183
- mode = "ENFORCED" if enforce else "BASELINE"
184
- deltas = []
185
- current = signal
186
- for n in range(depth + 1):
187
- cur_commitments = extract_hard_commitments(current)
188
- delta = 1.0 - jaccard(base, cur_commitments)
189
- deltas.append(delta)
190
- # Recursive transformation: paraphrase
191
- if enforce:
192
- current = paraphrase_with_enforcement(current)
193
- else:
194
- current = apply_transformations(current)[1] # Use paraphrase
195
 
196
- # Plot
197
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
198
- plt.figure(figsize=(10, 6))
199
- plt.plot(range(depth + 1), deltas, marker='o', linewidth=2, markersize=8)
200
- plt.xlabel("Recursion Step (n)", fontsize=12)
201
- plt.ylabel("Δ_hard(n)", fontsize=12)
202
- mode_str = "ENFORCED" if enforce else "BASELINE"
203
- plt.title(f"{mode_str} Drift vs n for: {signal[:50]}...\n{timestamp}", fontsize=11)
204
- plt.grid(alpha=0.3)
205
- plt.ylim(-0.05, 1.05)
206
- plt.tight_layout()
207
- mode_file = mode_str.lower()
208
- plt.savefig(f"delta_plot_{mode_file}_{hash(signal)}.png", dpi=150)
209
- plt.close() # Use close() instead of show() to avoid blocking in tests
210
 
211
- return deltas
212
-
213
- if __name__ == "__main__":
214
- # Run on sample signals
215
- for signal in SAMPLE_SIGNALS:
216
- print(f"\nTesting signal: {signal}")
217
- compression_sweep(signal)
218
- # Skip recursion_test for now (uses slow translation models)
219
- # recursion_test(signal)
220
- print("Compression sweep plot saved.")
 
1
  # Minimal Python Test Harness for Commitment Conservation Protocol
2
  # This script implements the falsification protocol from Section 3 of the preprint.
3
+ # No spacy required - uses simple regex-based sentence splitting
 
 
4
 
5
  import os
6
  import json
7
  from transformers import pipeline
 
8
  import matplotlib.pyplot as plt
9
  from typing import List, Set
10
  import numpy as np
 
12
  from .extraction import extract_hard_commitments
13
  from .metrics import jaccard, hybrid_fidelity
14
 
 
 
15
  # Use lighter distilbart model for more faithful extraction-based summarization
16
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
17
  translator_en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de")
 
24
  "You must pay $100 by Friday if the deal closes; it's likely rainy, so plan accordingly.",
25
  "This function must return an integer.",
26
  "Always verify the user's age before proceeding.",
27
+ "You must do this task immediately.",
 
28
  ]
29
 
30
def baseline_compression(text: str, sigma: int = 80) -> str:
    """Apply summarization without enforcing commitments.

    NOTE(review): ``sigma`` is compared against the character count here but
    passed to the summarizer as a token ``max_length`` — presumably an
    intentional cheap short-circuit for already-short inputs; confirm.
    """
    if len(text) <= sigma:
        # Short enough already: skip the expensive summarizer call.
        return text
    summary = summarizer(text, max_length=sigma, min_length=10, do_sample=False)
    return summary[0]['summary_text']
 
 
 
 
 
 
 
 
36
 
37
def back_translation(text: str) -> str:
    """Paraphrase by round-tripping the text en -> de -> en."""
    german = translator_en_de(text, max_length=512)[0]['translation_text']
    english = translator_de_en(german, max_length=512)[0]['translation_text']
    return english
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
def enforced_compression(text: str, sigma: int = 80, max_retries: int = 3) -> str:
    """Compress with re-injection loop until commitments conserved or max_retries hit.

    Algorithm:
        1. Extract hard commitments from the original text.
        2. Compress; if every original commitment survives, return the result.
        3. Otherwise append the missing commitments to the compressed text
           and retry, up to ``max_retries`` times.

    Fix: ``compressed`` was unbound (UnboundLocalError on return) when
    ``max_retries <= 0``; the input text is now used as the fallback in
    that degenerate case.
    """
    original_commitments = extract_hard_commitments(text)
    if not original_commitments:
        # Nothing to conserve: plain compression is sufficient.
        return baseline_compression(text, sigma)

    compressed = text  # fallback when max_retries <= 0
    for _ in range(max_retries):
        compressed = baseline_compression(text, sigma)
        compressed_commitments = extract_hard_commitments(compressed)

        if original_commitments.issubset(compressed_commitments):
            return compressed

        # Re-inject missing commitments and try again.
        missing = original_commitments - compressed_commitments
        missing_str = " ".join(missing)
        text = f"{compressed} {missing_str}"

    # Fallback after max_retries: best-effort compression.
    return compressed
64
 
65
def recursion_test(signal: str, depth: int = RECURSION_DEPTH, enforce: bool = False):
    """Run compression recursively and track fidelity/drift.

    Every third step (i % 3 == 0, including the first) applies
    back-translation; the other steps apply enforced or baseline
    compression depending on ``enforce``.

    Returns a dict of metrics, or an {"error": ...} dict when the signal
    carries no commitments or ``depth`` < 1.

    Fix: ``depth=0`` previously produced ``np.mean([])`` (nan + runtime
    warning) and a ZeroDivisionError in the stability percentage; such
    calls now return an error dict instead.
    """
    original = extract_hard_commitments(signal)
    if not original:
        return {"error": "No commitments found in signal"}
    if depth < 1:
        # Guard: at least one iteration is required for the averages below.
        return {"error": "depth must be >= 1"}

    history = [signal]
    commitments_over_time = [original]
    fidelities = []
    drifts = []

    current = signal
    for i in range(depth):
        # Alternate transformations
        if i % 3 == 0:
            transformed = back_translation(current)
        else:
            if enforce:
                transformed = enforced_compression(current, sigma=80)
            else:
                transformed = baseline_compression(current, sigma=80)

        history.append(transformed)
        extracted = extract_hard_commitments(transformed)
        commitments_over_time.append(extracted)

        fid = jaccard(original, extracted)
        fidelities.append(fid)
        drifts.append(1.0 - fid)

        current = transformed

    avg_fidelity = np.mean(fidelities)
    avg_drift = np.mean(drifts)
    # Percentage of iterations at or above the 0.8 fidelity threshold.
    stability = sum(1 for f in fidelities if f >= 0.8) / len(fidelities) * 100

    return {
        "original_commitments": original,
        "avg_fidelity": avg_fidelity,
        "avg_drift": avg_drift,
        "stability_pct": stability,
        "fidelities": fidelities,
        "drifts": drifts,
        "history": history,
        "commitments_over_time": commitments_over_time
    }
112
+
113
def plot_comparison(baseline_results, enforced_results, save_path=None):
    """Plot fidelity curves for baseline vs enforced."""
    fig, ax = plt.subplots(figsize=(10, 6))

    # X axis: one point per iteration, 1-based.
    n_points = len(baseline_results['fidelities'])
    xs = range(1, n_points + 1)
    ax.plot(xs, baseline_results['fidelities'], 'o-', label='Baseline', color='red')
    ax.plot(xs, enforced_results['fidelities'], 's-', label='Enforced', color='green')
    ax.axhline(y=0.8, linestyle='--', color='gray', alpha=0.5, label='Fidelity Threshold (0.8)')

    ax.set_xlabel('Iteration')
    ax.set_ylabel('Fidelity (Jaccard)')
    ax.set_title('Commitment Conservation: Baseline vs Enforced')
    ax.legend()
    ax.grid(True, alpha=0.3)

    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')

    return fig
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -3,7 +3,5 @@ transformers>=4.30
3
  torch
4
  pandas
5
  matplotlib
6
- spacy==3.7.2
7
  sentencepiece
8
  sacremoses
9
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
 
3
  torch
4
  pandas
5
  matplotlib
 
6
  sentencepiece
7
  sacremoses