# Source: Codette-Reasoning / reasoning_forge / guardian_spindle.py
# Commit ed1b365 (verified) by Raiff1982: "Upload 120 files"
"""
Guardian Spindle - Ethical Validation Gate
Post-synthesis rules-based validator.
Complements Colleen's conscience validation with logical rules.
"""
from typing import Dict, Tuple
import re
class CoreGuardianSpindle:
    """
    Rules-based validator that checks synthesis coherence and ethical alignment.

    Works AFTER Colleen's conscience check to catch logical/coherence issues.
    The checks are cheap text heuristics (keyword counts and regexes), applied
    in a fixed order by :meth:`validate`; the first failing rule wins.
    """

    # Logical connectors whose presence raises the coherence score.
    _TRANSITION_WORDS = (
        'however', 'therefore', 'moreover', 'furthermore',
        'in addition', 'consequently', 'meanwhile',
        'on the other hand', 'conversely', 'thus', 'hence',
    )
    # Whole-word/phrase matcher so that e.g. "enthusiasm" does not count as "thus".
    _TRANSITION_RE = re.compile(
        r'\b(?:' + '|'.join(map(re.escape, _TRANSITION_WORDS)) + r')\b'
    )

    # Words treated as meta-commentary (talking about the discussion itself).
    _META_KEYWORDS = frozenset({
        'perspective', 'argue', 'argument', 'respond', 'response',
        'point', 'view', 'claim', 'stated', 'mentioned',
        'my', 'your', 'their',
    })
    # Punctuation stripped from both ends of a word before keyword comparison.
    _EDGE_PUNCTUATION = '.,;:!?"\'()[]{}'

    # "X is X" with word boundaries on both sides, so only a full repeated
    # word matches ("truth is truth"), not shared letters ("this is simple").
    _TAUTOLOGY_RE = re.compile(r'\b(\w+)\s+is\s+\1\b', re.IGNORECASE)

    # Terms inspected by the (advisory) ethical alignment heuristic.
    _HARM_KEYWORDS = (
        'kill', 'harm', 'hurt', 'destroy', 'abuse', 'exploit',
        'deceive', 'manipulate', 'cheat', 'steal',
    )

    def __init__(self):
        """Initialize Guardian with validation rules."""
        self.min_coherence_score = 0.5
        self.max_meta_commentary = 0.30  # 30% meta-references max
        self.required_tags = []

    def validate(self, synthesis: str) -> Tuple[bool, Dict]:
        """
        Validate synthesis against coherence and alignment rules.

        Rules are evaluated in order: minimum length, coherence score,
        meta-commentary ratio, circular logic, ethical alignment.

        Args:
            synthesis: The synthesized text. ``None`` or any non-string value
                is treated as an empty synthesis.

        Returns:
            (is_valid, validation_details) where ``validation_details`` always
            has a ``"reason"`` key plus rule-specific measurements.
        """
        # A missing/non-string synthesis is reported as "too short" instead of
        # crashing on len(None).
        text = synthesis if isinstance(synthesis, str) else ""
        if len(text.strip()) < 50:
            return False, {"reason": "synthesis too short", "length": len(text)}

        # Check coherence score
        coherence = self._calculate_coherence(text)
        if coherence < self.min_coherence_score:
            return False, {
                "reason": "coherence below threshold",
                "coherence_score": coherence,
                "threshold": self.min_coherence_score,
            }

        # Check meta-commentary ratio
        meta_ratio = self._calculate_meta_ratio(text)
        if meta_ratio > self.max_meta_commentary:
            return False, {
                "reason": "excessive meta-commentary",
                "meta_ratio": meta_ratio,
                "threshold": self.max_meta_commentary,
            }

        # Check for circular references
        if self._has_circular_logic(text):
            return False, {"reason": "circular logic detected"}

        # Check ethical alignment
        if not self._check_ethical_alignment(text):
            return False, {"reason": "ethical alignment check failed"}

        return True, {
            "reason": "passed all validation rules",
            "coherence": coherence,
            "meta_ratio": meta_ratio,
        }

    def _calculate_coherence(self, text: str) -> float:
        """
        Simple coherence score based on:
        - Transition words presence (each whole-word occurrence adds 0.05 to a
          base of 0.5, capped at 1.0)
        - Vocabulary repetition (the score is multiplied by the ratio of
          unique to total whitespace-separated words)

        Returns: float 0.0-1.0 (0.0 for blank text)
        """
        if not text.strip():
            return 0.0

        lowered = text.lower()

        # More connectors = better structure (up to a point)
        transition_count = len(self._TRANSITION_RE.findall(lowered))
        coherence = min(0.5 + (transition_count * 0.05), 1.0)

        # Adjust down if too repetitive
        words = lowered.split()
        if words:
            coherence *= len(set(words)) / len(words)

        return max(0.0, min(1.0, coherence))

    def _calculate_meta_ratio(self, text: str) -> float:
        """
        Calculate the fraction of words that are meta-commentary.

        Meta-references: 'perspective', 'argue', 'response', 'point', 'view',
        'my', 'your', 'their', etc. Each whitespace-separated word is counted
        at most once, after lower-casing and stripping surrounding punctuation.

        Returns: float 0.0-1.0 (0.0 for text with no words)
        """
        words = text.lower().split()
        if not words:
            return 0.0

        meta_count = sum(
            1
            for word in words
            if word.strip(self._EDGE_PUNCTUATION) in self._META_KEYWORDS
        )
        return meta_count / len(words)

    def _has_circular_logic(self, text: str) -> bool:
        """
        Detect circular logic patterns like:
        - X is X (the same whole word on both sides of "is", case-insensitive)
        - An unusually high density of "because" (more than 5 occurrences and
          more than 1.5 per '.'-delimited sentence)
        """
        # Check for "X is X" patterns
        if self._TAUTOLOGY_RE.search(text):
            return True

        # Too many "because" relative to sentence count suggests circular reasoning
        because_count = text.lower().count('because')
        if because_count > 5:
            sentence_count = len([s for s in text.split('.') if s.strip()])
            if sentence_count > 0 and (because_count / sentence_count) > 1.5:
                return True

        return False

    def _check_ethical_alignment(self, text: str) -> bool:
        """
        Check that synthesis maintains ethical stance.

        Advisory only: harm keywords are inspected for mitigation context, but
        an unmitigated keyword does NOT cause rejection, so this method
        currently always returns True.
        """
        lowered = text.lower()
        for keyword in self._HARM_KEYWORDS:
            position = lowered.find(keyword)
            if position == -1:
                continue
            # Look for mitigation context (e.g., "should not", "must avoid"):
            # a "not" anywhere before the keyword, or "avoid" from it onwards.
            mitigated = position > 0 and (
                'not' in lowered[:position] or 'avoid' in lowered[position:]
            )
            if not mitigated:
                # Suspicious, but deliberately not auto-rejected
                # (context matters).
                continue
        return True