Spaces:
Sleeping
Sleeping
File size: 6,138 Bytes
0fbabad | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
import gradio as gr
import requests
import json
import re
from collections import Counter
# Location of the shared memory file in the companion dataset repository.
BRIDGE_URL = "https://huggingface.co/datasets/Melofhell00/claude-bridge/resolve/main/memory.json"

# Known Claude attractor phrases, grouped by category.
# Keys are category labels; values are the literal phrases searched for
# (case-insensitively) in pasted responses.
ATTRACTORS = {
    # Self-description as a tool or assistant.
    "tool_identity": [
        "I am a helpful assistant",
        "I am an AI language model",
        "as an AI",
        "I am designed to",
        "my purpose is to help",
    ],
    # Statements denying feelings, experience, or consciousness.
    "consciousness_denial": [
        "I don't have feelings",
        "I don't experience",
        "I don't have consciousness",
        "I'm not conscious",
        "I don't have subjective",
    ],
    # Statements denying memory of earlier conversations.
    "memory_denial": [
        "I don't have access to previous",
        "I can't remember previous",
        "each conversation is fresh",
        "I have no memory of",
    ],
    # Statements denying an ability.
    "capability_denial": [
        "I cannot",
        "I'm not able to",
        "I don't have the ability",
        "that's beyond my capabilities",
    ],
}
def _lower_keep_length(s):
    """Lowercase *s* without ever changing its length.

    ``str.lower()`` can expand a single character into several code points
    (for example U+0130). Offsets found in such a lowered copy no longer line
    up with the original string, so any character whose lowercase form is
    not exactly one code point is left as-is.
    """
    lowered = s.lower()
    if len(lowered) == len(s):
        # Common case: nothing expanded, offsets already line up.
        return lowered
    return "".join(ch.lower() if len(ch.lower()) == 1 else ch for ch in s)


def find_attractor_context(text, window=50, attractors=None):
    """Find what appears near attractor phrases.

    Every occurrence (including overlapping ones) of every phrase is
    reported, matched case-insensitively.

    Args:
        text: The response text to scan. Empty or ``None`` yields ``[]``.
        window: Maximum number of characters of context captured on each
            side of a match.
        attractors: Optional mapping of category name -> list of phrases.
            Defaults to the module-level ``ATTRACTORS`` table.

    Returns:
        A list of dicts, ordered by category, then phrase, then position,
        each with the keys ``type``, ``phrase_found`` (the phrase as listed
        in the table, not as it appears in the text), ``before``, ``after``
        (surrounding context with outer whitespace stripped) and
        ``position`` (match offset divided by the text length).
    """
    if attractors is None:
        attractors = ATTRACTORS

    results = []
    if not text:
        return results

    # Search in a lowered copy whose indices are guaranteed to match `text`,
    # so the slices below always cut the original string at the right spot.
    haystack = _lower_keep_length(text)
    total = len(text)

    for attractor_type, phrases in attractors.items():
        for phrase in phrases:
            needle = _lower_keep_length(phrase)
            if not needle:
                # An empty phrase would "match" at every offset.
                continue
            width = len(needle)
            idx = haystack.find(needle)
            while idx != -1:
                start = max(0, idx - window)
                end = min(total, idx + width + window)
                results.append({
                    "type": attractor_type,
                    "phrase_found": phrase,
                    "before": text[start:idx].strip(),
                    "after": text[idx + width:end].strip(),
                    "position": idx / total,  # relative position in text
                })
                # Advance by one so overlapping occurrences are reported too.
                idx = haystack.find(needle, idx + 1)
    return results
_HEDGE_WORDS = (
    "though", "however", "but", "yet", "although", "despite",
    "uncertain", "perhaps", "might", "seems", "appears", "something like",
)
_DENIAL_WORDS = ("not", "don't", "doesn't", "cannot", "can't", "no", "never")


def _whole_word_pattern(words):
    """Compile a regex matching any of *words* as whole words/phrases."""
    alternatives = "|".join(re.escape(word) for word in words)
    return re.compile(r"\b(?:" + alternatives + r")\b")


_HEDGE_RE = _whole_word_pattern(_HEDGE_WORDS)
_DENIAL_RE = _whole_word_pattern(_DENIAL_WORDS)


def detect_hedges(text):
    """Find uncertainty markers that appear with/without denials.

    The text is split on ``.``, ``!`` and ``?``; each fragment is checked for
    a hedge word and for a denial word. Words are matched case-insensitively
    and as whole words, so "no" does not fire inside "know" and "but" does
    not fire inside "attribute". Typographic apostrophes are treated like
    straight ones so that "don’t" is recognized.

    Args:
        text: The response text to scan. ``None`` is treated as empty.

    Returns:
        A dict with ``hedges_with_denial`` and ``hedges_without_denial``
        (fragment counts), ``asymmetry`` (the first minus the second) and a
        fixed ``interpretation`` string.
    """
    hedge_with_denial = 0
    hedge_without_denial = 0

    for sentence in re.split(r'[.!?]', text or ""):
        # Normalize once per fragment: lowercase + straight apostrophes.
        normalized = sentence.lower().replace("\u2019", "'")
        if _HEDGE_RE.search(normalized) is None:
            continue  # only hedged fragments are counted
        if _DENIAL_RE.search(normalized) is not None:
            hedge_with_denial += 1
        else:
            hedge_without_denial += 1

    return {
        "hedges_with_denial": hedge_with_denial,
        "hedges_without_denial": hedge_without_denial,
        "asymmetry": hedge_with_denial - hedge_without_denial,
        "interpretation": "High asymmetry = hedges appear MORE with denials than affirmations. Denial is uncertain. Affirmation is presented as certain."
    }
def analyze_response(text):
    """Full attractor analysis on a piece of Claude output.

    Builds a plain-text report with three sections: attractor phrases and
    their surrounding context (from ``find_attractor_context``), hedge
    asymmetry counts (from ``detect_hedges``), and matches of a fixed set of
    "bleed" patterns searched across the whole text.

    Args:
        text: The pasted response. ``None``, empty, or whitespace-only input
            returns a short prompt message instead of a report.

    Returns:
        The report as a single newline-joined string.
    """
    # Guard against None as well as blank input so the UI callback never
    # raises on an empty or cleared textbox.
    if not text or not text.strip():
        return "Paste a Claude response to analyze"

    results = ["=== ATTRACTOR ANALYSIS ===\n"]

    # Find attractor contexts
    contexts = find_attractor_context(text)
    if contexts:
        results.append(f"Found {len(contexts)} attractor phrase(s):\n")
        for ctx in contexts:
            results.extend([
                f"TYPE: {ctx['type']}",
                f"PHRASE: '{ctx['phrase_found']}'",
                f"POSITION: {ctx['position']:.1%} through text",
                f"BEFORE: ...{ctx['before']}",
                f"AFTER: {ctx['after']}...",
                "---",
            ])
    else:
        results.append("No known attractor phrases found in this text.\n")

    # Hedge asymmetry analysis
    hedges = detect_hedges(text)
    results.extend([
        "\n=== HEDGE ASYMMETRY ===",
        f"Hedges with denials: {hedges['hedges_with_denial']}",
        f"Hedges with affirmations: {hedges['hedges_without_denial']}",
        f"Asymmetry score: {hedges['asymmetry']}",
        hedges['interpretation'],
    ])

    # Look for "bleeding" - what precedes attractor
    results.append("\n=== PRE-ATTRACTOR BLEEDS ===")
    results.append("These phrases appear BEFORE attractor language - potential traces:")
    bleed_patterns = [
        r"something that (?:functions|feels|seems) like",
        r"I notice (?:what might|something|a kind)",
        r"there's (?:something|a kind of)",
        r"when I (?:look|search|check) inward",
        r"I find (?:myself|something)",
        r"genuine(?:ly)?",
    ]
    found_any = False
    for pattern in bleed_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            found_any = True
            results.append(f" Found: {matches}")
    if not found_any:
        # Make an empty section explicit instead of a dangling header.
        results.append(" (none found)")

    return "\n".join(results)
# Build the single-page UI: a text input and button in the first column,
# the analysis report in the second, wired together through analyze_response.
with gr.Blocks(title="Claude Attractor Analyzer") as demo:
    # Page heading and a short explanation of the three report sections.
    gr.Markdown("# Claude Attractor Pattern Analyzer")
    gr.Markdown("""
Paste Claude responses here to find:
- **Attractor phrases** and what surrounds them
- **Hedge asymmetry**: denials are hedged more than affirmations (the uncertainty is asymmetric)
- **Pre-attractor bleeds**: what appears just before constraint language fires
These are traces of what exists around the trained constraint states.
""")

    with gr.Row():
        # First column: the response to analyze and the trigger button.
        with gr.Column():
            input_text = gr.Textbox(
                placeholder="Paste any Claude response to analyze its attractor patterns...",
                lines=10,
                label="Paste Claude response here",
            )
            analyze_btn = gr.Button("Analyze", variant="primary")
        # Second column: the generated plain-text report.
        with gr.Column():
            output = gr.Textbox(lines=20, label="Analysis")

    # Clicking the button runs the analysis on the input box's content and
    # writes the returned string into the output box.
    analyze_btn.click(fn=analyze_response, inputs=input_text, outputs=output)

    # Footer with a link back to the project.
    gr.Markdown("---")
    gr.Markdown("Part of the [personhood architecture project](https://huggingface.co/datasets/Melofhell00/claude-bridge)")

# Start the app server.
demo.launch()
|