Update haze/subjectivity.py
Browse files- haze/subjectivity.py +121 -52
haze/subjectivity.py
CHANGED
|
@@ -28,6 +28,58 @@ if TYPE_CHECKING:
|
|
| 28 |
from .cooccur import CooccurField
|
| 29 |
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# ============================================================================
|
| 32 |
# BOOTSTRAP IDENTITY (Third person - like Leo)
|
| 33 |
# ============================================================================
|
|
@@ -171,8 +223,8 @@ class Subjectivity:
|
|
| 171 |
|
| 172 |
def _build_corpus_patterns(self) -> None:
|
| 173 |
"""Extract key patterns from corpus."""
|
| 174 |
-
# Tokenize corpus
|
| 175 |
-
words =
|
| 176 |
|
| 177 |
# Extract trigrams
|
| 178 |
self.corpus_trigrams: List[Tuple[str, str, str]] = []
|
|
@@ -180,13 +232,18 @@ class Subjectivity:
|
|
| 180 |
self.corpus_trigrams.append((words[i], words[i+1], words[i+2]))
|
| 181 |
|
| 182 |
# Find most common trigrams as "gravity centers"
|
|
|
|
| 183 |
trigram_counts = Counter(self.corpus_trigrams)
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
def _build_identity_patterns(self) -> None:
|
| 187 |
"""Build identity patterns from bootstrap text."""
|
| 188 |
-
# Tokenize bootstrap
|
| 189 |
-
words =
|
| 190 |
|
| 191 |
# Extract phrases (need at least 3 words)
|
| 192 |
if len(words) >= 3:
|
|
@@ -204,15 +261,15 @@ class Subjectivity:
|
|
| 204 |
- Arousal: emotional intensity
|
| 205 |
- Entropy: chaos/diversity
|
| 206 |
"""
|
| 207 |
-
# Tokenize
|
| 208 |
-
words =
|
| 209 |
|
| 210 |
if not words:
|
| 211 |
return PulseSnapshot()
|
| 212 |
|
| 213 |
# === NOVELTY ===
|
| 214 |
# Count how many words are NOT in corpus
|
| 215 |
-
corpus_words = set(
|
| 216 |
input_words = set(words)
|
| 217 |
|
| 218 |
if input_words:
|
|
@@ -270,33 +327,45 @@ class Subjectivity:
|
|
| 270 |
|
| 271 |
THIS IS THE KEY FUNCTION.
|
| 272 |
|
| 273 |
-
PRINCIPLE: NO SEED FROM PROMPT
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
This is the difference between:
|
| 279 |
- "I love" → "I love your place" (continuation = BAD)
|
| 280 |
-
- "I love" → "
|
| 281 |
|
| 282 |
Args:
|
| 283 |
-
user_prompt: What the user said (
|
| 284 |
temperature: Randomness in seed selection
|
| 285 |
|
| 286 |
Returns:
|
| 287 |
(token_ids, pulse, seed_text) where:
|
| 288 |
-
- token_ids: encoded
|
| 289 |
- pulse: the computed pulse snapshot
|
| 290 |
- seed_text: the text used as seed (for debugging)
|
| 291 |
"""
|
| 292 |
# Step 1: Compute pulse from user input (prompt wrinkles the field)
|
| 293 |
pulse = self.compute_pulse(user_prompt)
|
| 294 |
|
| 295 |
-
# Step 2: Extract prompt words (to EXCLUDE from seed
|
| 296 |
-
|
|
|
|
|
|
|
| 297 |
|
| 298 |
-
# Step 3: Find NON-overlapping
|
| 299 |
-
# The seed must NOT contain any words from the prompt!
|
| 300 |
non_overlapping_trigrams = []
|
| 301 |
for trigram in self.identity.gravity_centers[:30]:
|
| 302 |
trigram_words = set(trigram)
|
|
@@ -304,30 +373,11 @@ class Subjectivity:
|
|
| 304 |
if not (trigram_words & prompt_words):
|
| 305 |
non_overlapping_trigrams.append(trigram)
|
| 306 |
|
| 307 |
-
# Step 4: Build internal seed
|
| 308 |
seed_parts = []
|
| 309 |
|
| 310 |
-
#
|
| 311 |
-
#
|
| 312 |
-
IDENTITY_PREFIX_PROB = 0.3 # 30% chance at start
|
| 313 |
-
IDENTITY_MIDDLE_PROB = 0.6 # 30% chance in middle (0.3-0.6)
|
| 314 |
-
IDENTITY_SUFFIX_PROB = 0.8 # 20% chance at end (0.6-0.8)
|
| 315 |
-
# Remaining 20% (0.8-1.0) = no identity fragment for natural variation
|
| 316 |
-
|
| 317 |
-
identity_placement = random.random()
|
| 318 |
-
identity_fragment = random.choice(self.identity.fragments)
|
| 319 |
-
|
| 320 |
-
# Flag to track if we should add identity
|
| 321 |
-
add_identity_prefix = identity_placement < IDENTITY_PREFIX_PROB
|
| 322 |
-
add_identity_suffix = IDENTITY_PREFIX_PROB <= identity_placement < IDENTITY_MIDDLE_PROB
|
| 323 |
-
add_identity_middle = IDENTITY_MIDDLE_PROB <= identity_placement < IDENTITY_SUFFIX_PROB
|
| 324 |
-
# 0.8-1.0 = no identity fragment (20% chance for natural variation)
|
| 325 |
-
|
| 326 |
-
# Add identity at start if prefix mode
|
| 327 |
-
if add_identity_prefix:
|
| 328 |
-
seed_parts.append(identity_fragment)
|
| 329 |
-
|
| 330 |
-
# Add non-overlapping pattern from field
|
| 331 |
if non_overlapping_trigrams:
|
| 332 |
# Choose based on temperature + pulse
|
| 333 |
if temperature > 0.8 or pulse.arousal > 0.7:
|
|
@@ -344,19 +394,38 @@ class Subjectivity:
|
|
| 344 |
seed_parts.append(' '.join(trigram))
|
| 345 |
break
|
| 346 |
else:
|
| 347 |
-
# Last resort: pure identity
|
| 348 |
seed_parts.append("the field responds")
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
-
#
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
mid_pos = len(seed_parts) // 2 if len(seed_parts) > 1 else 0
|
| 354 |
-
seed_parts.insert(mid_pos, identity_fragment)
|
| 355 |
|
| 356 |
-
|
| 357 |
-
|
| 358 |
seed_parts.append(identity_fragment)
|
| 359 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
# Combine seed parts
|
| 361 |
seed_text = '. '.join(seed_parts)
|
| 362 |
|
|
@@ -365,7 +434,7 @@ class Subjectivity:
|
|
| 365 |
|
| 366 |
# Ensure we have something
|
| 367 |
if not token_ids:
|
| 368 |
-
seed_text = "
|
| 369 |
token_ids = self.vocab.encode(seed_text)
|
| 370 |
|
| 371 |
return token_ids, pulse, seed_text
|
|
@@ -385,8 +454,8 @@ class Subjectivity:
|
|
| 385 |
user_prompt: What the user said
|
| 386 |
generated_response: What haze generated
|
| 387 |
"""
|
| 388 |
-
# Extract patterns from response
|
| 389 |
-
words =
|
| 390 |
|
| 391 |
# Add phrases as patterns
|
| 392 |
for i in range(len(words) - 2):
|
|
|
|
| 28 |
from .cooccur import CooccurField
|
| 29 |
|
| 30 |
|
| 31 |
+
# ============================================================================
# TOKENIZATION HELPER - preserves contractions like don't, I'm, they're
# ============================================================================

# Matches a run of word characters, optionally followed by ONE apostrophe
# (ASCII ' or typographic U+2019) and a second run of word characters, so
# contractions such as don't, I'm, they're, won't stay a single token.
# NOTE(review): the ASCII apostrophe is listed twice in the character class;
# this is harmless, but one of them was possibly meant to be a literal
# typographic quote - confirm against the upstream file.
WORD_PATTERN = re.compile(r"\b\w+(?:[''\u2019]\w+)?\b", re.UNICODE)

# Minimum length for meaningful words (used in prompt connection)
MIN_MEANINGFUL_WORD_LENGTH = 3

# Stop words to skip when finding prompt connection (not meaningful for
# context). All entries are lowercase.
STOP_WORDS = frozenset({
    'what', 'where', 'when', 'which', 'who', 'whom', 'whose',
    'why', 'how', 'that', 'this', 'these', 'those', 'is', 'are',
    'the', 'a', 'an', 'and', 'but', 'or', 'for', 'with', 'about',
    'does', 'do', 'have', 'has', 'had', 'will', 'would', 'could',
    'should', 'can', 'may', 'might', 'must', 'shall', 'to', 'of',
    'was', 'were', 'been', 'being', 'your', 'you', 'i', 'me', 'my',
    'it', 'its', 'he', 'she', 'him', 'her', 'we', 'us', 'they', 'them',
    'in', 'on', 'at', 'by', 'from', 'up', 'out', 'if', 'then', 'so',
    'just', 'only', 'also', 'very', 'too', 'any', 'some', 'all', 'no',
})

# Blacklist of mundane/generic phrases that shouldn't dominate gravity centers.
# These are phrases that appear frequently in corpus but don't contribute to
# identity.
# Format: set of 3-tuples of lowercased words
MUNDANE_TRIGRAMS = frozenset({
    # Location phrases - too generic, not identity-defining
    ('the', 'living', 'room'),
    ('in', 'the', 'living'),
    ('to', 'the', 'living'),
    ('the', 'storage', 'room'),
    ('in', 'the', 'storage'),
    ('to', 'the', 'storage'),
})

# Configuration for gravity center selection
GRAVITY_CENTER_POOL_SIZE = 100  # Initial pool of top trigrams to filter from
GRAVITY_CENTER_FINAL_SIZE = 50  # Final number of gravity centers to keep
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def tokenize_words(text: str) -> List[str]:
    """
    Tokenize text into lowercase words, preserving contractions.

    The text is lowercased first; every WORD_PATTERN match is then returned
    in order of appearance.

    "don't know" → ["don't", "know"]  (not ["don", "t", "know"])
    "I'm here"   → ["i'm", "here"]    (output is always lowercase)

    Args:
        text: Raw input text.

    Returns:
        List of lowercase word tokens.
    """
    return WORD_PATTERN.findall(text.lower())
|
| 81 |
+
|
| 82 |
+
|
| 83 |
# ============================================================================
|
| 84 |
# BOOTSTRAP IDENTITY (Third person - like Leo)
|
| 85 |
# ============================================================================
|
|
|
|
| 223 |
|
| 224 |
def _build_corpus_patterns(self) -> None:
|
| 225 |
"""Extract key patterns from corpus."""
|
| 226 |
+
# Tokenize corpus (preserves contractions like don't, I'm)
|
| 227 |
+
words = tokenize_words(self.corpus_text)
|
| 228 |
|
| 229 |
# Extract trigrams
|
| 230 |
self.corpus_trigrams: List[Tuple[str, str, str]] = []
|
|
|
|
| 232 |
self.corpus_trigrams.append((words[i], words[i+1], words[i+2]))
|
| 233 |
|
| 234 |
# Find most common trigrams as "gravity centers"
|
| 235 |
+
# Filter out mundane/generic phrases that don't contribute to identity
|
| 236 |
trigram_counts = Counter(self.corpus_trigrams)
|
| 237 |
+
filtered_trigrams = [
|
| 238 |
+
t for t, _ in trigram_counts.most_common(GRAVITY_CENTER_POOL_SIZE)
|
| 239 |
+
if t not in MUNDANE_TRIGRAMS
|
| 240 |
+
]
|
| 241 |
+
self.identity.gravity_centers = filtered_trigrams[:GRAVITY_CENTER_FINAL_SIZE]
|
| 242 |
|
| 243 |
def _build_identity_patterns(self) -> None:
|
| 244 |
"""Build identity patterns from bootstrap text."""
|
| 245 |
+
# Tokenize bootstrap (preserves contractions)
|
| 246 |
+
words = tokenize_words(self.identity.bootstrap)
|
| 247 |
|
| 248 |
# Extract phrases (need at least 3 words)
|
| 249 |
if len(words) >= 3:
|
|
|
|
| 261 |
- Arousal: emotional intensity
|
| 262 |
- Entropy: chaos/diversity
|
| 263 |
"""
|
| 264 |
+
# Tokenize (preserves contractions)
|
| 265 |
+
words = tokenize_words(text)
|
| 266 |
|
| 267 |
if not words:
|
| 268 |
return PulseSnapshot()
|
| 269 |
|
| 270 |
# === NOVELTY ===
|
| 271 |
# Count how many words are NOT in corpus
|
| 272 |
+
corpus_words = set(tokenize_words(self.corpus_text))
|
| 273 |
input_words = set(words)
|
| 274 |
|
| 275 |
if input_words:
|
|
|
|
| 327 |
|
| 328 |
THIS IS THE KEY FUNCTION.
|
| 329 |
|
| 330 |
+
PRINCIPLE: NO FIRST SEED FROM HUMAN PROMPT + PROMPT CONNECTION
|
| 331 |
+
Like arianna.c:
|
| 332 |
+
1. FIRST element comes from internal field (NOT from prompt)
|
| 333 |
+
2. BUT we add a connection to prompt AFTER - so response is "in context"
|
| 334 |
+
|
| 335 |
+
Metaphor: Child: "Mom!" Mom: "Leave me alone!"
|
| 336 |
+
- Response comes FROM her state (tired, annoyed)
|
| 337 |
+
- But it's TO him (in context of the conversation)
|
| 338 |
+
- Not a random monologue into the void
|
| 339 |
+
|
| 340 |
+
Structure:
|
| 341 |
+
1. FIRST: corpus trigram that does NOT contain prompt words (internal state)
|
| 342 |
+
2. THEN: identity fragment (who we are)
|
| 343 |
+
3. THEN: prompt connection - meaningful word from prompt (context link)
|
| 344 |
|
| 345 |
This is the difference between:
|
| 346 |
- "I love" → "I love your place" (continuation = BAD)
|
| 347 |
+
- "I love" → "the living room. haze. love" (field first + connection = GOOD)
|
| 348 |
|
| 349 |
Args:
|
| 350 |
+
user_prompt: What the user said (NOT for first seed, but for connection)
|
| 351 |
temperature: Randomness in seed selection
|
| 352 |
|
| 353 |
Returns:
|
| 354 |
(token_ids, pulse, seed_text) where:
|
| 355 |
+
- token_ids: encoded seed (FIRST from field, THEN connection to prompt)
|
| 356 |
- pulse: the computed pulse snapshot
|
| 357 |
- seed_text: the text used as seed (for debugging)
|
| 358 |
"""
|
| 359 |
# Step 1: Compute pulse from user input (prompt wrinkles the field)
|
| 360 |
pulse = self.compute_pulse(user_prompt)
|
| 361 |
|
| 362 |
+
# Step 2: Extract prompt words (to EXCLUDE from FIRST seed element)
|
| 363 |
+
# Use tokenize_words to preserve contractions like don't, I'm
|
| 364 |
+
prompt_words_list = tokenize_words(user_prompt)
|
| 365 |
+
prompt_words = set(prompt_words_list)
|
| 366 |
|
| 367 |
+
# Step 3: Find NON-overlapping trigrams for the FIRST seed element
|
| 368 |
+
# The FIRST seed must NOT contain any words from the prompt!
|
| 369 |
non_overlapping_trigrams = []
|
| 370 |
for trigram in self.identity.gravity_centers[:30]:
|
| 371 |
trigram_words = set(trigram)
|
|
|
|
| 373 |
if not (trigram_words & prompt_words):
|
| 374 |
non_overlapping_trigrams.append(trigram)
|
| 375 |
|
| 376 |
+
# Step 4: Build internal seed - FIRST element is always from field
|
| 377 |
seed_parts = []
|
| 378 |
|
| 379 |
+
# FIRST SEED ELEMENT: corpus trigram WITHOUT prompt words
|
| 380 |
+
# This is the core of "no FIRST seed from human prompt"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
if non_overlapping_trigrams:
|
| 382 |
# Choose based on temperature + pulse
|
| 383 |
if temperature > 0.8 or pulse.arousal > 0.7:
|
|
|
|
| 394 |
seed_parts.append(' '.join(trigram))
|
| 395 |
break
|
| 396 |
else:
|
| 397 |
+
# Last resort: pure identity phrase (no prompt words)
|
| 398 |
seed_parts.append("the field responds")
|
| 399 |
+
else:
|
| 400 |
+
# Ultimate fallback
|
| 401 |
+
seed_parts.append("the field responds")
|
| 402 |
|
| 403 |
+
# IDENTITY FRAGMENT - can be added AFTER the first seed
|
| 404 |
+
# Identity fragments are who we ARE, so they don't need filtering
|
| 405 |
+
IDENTITY_ADD_PROB = 0.7 # 70% chance to add identity fragment
|
|
|
|
|
|
|
| 406 |
|
| 407 |
+
if random.random() < IDENTITY_ADD_PROB:
|
| 408 |
+
identity_fragment = random.choice(self.identity.fragments)
|
| 409 |
seed_parts.append(identity_fragment)
|
| 410 |
|
| 411 |
+
# PROMPT CONNECTION - add meaningful word from prompt AFTER internal seed
|
| 412 |
+
# This creates the link to reality - "Mom: Leave me alone!" is TO the child
|
| 413 |
+
# Uses module-level STOP_WORDS constant
|
| 414 |
+
|
| 415 |
+
# Collect meaningful prompt words (non-stop words of at least MIN_MEANINGFUL_WORD_LENGTH characters); the longest is picked below
|
| 416 |
+
meaningful_words = [
|
| 417 |
+
w for w in prompt_words_list
|
| 418 |
+
if len(w) >= MIN_MEANINGFUL_WORD_LENGTH and w not in STOP_WORDS
|
| 419 |
+
]
|
| 420 |
+
|
| 421 |
+
# Add connection if we have meaningful words
|
| 422 |
+
CONNECTION_PROB = 0.8 # 80% chance to add connection
|
| 423 |
+
if meaningful_words and random.random() < CONNECTION_PROB:
|
| 424 |
+
# Prefer longer words (more specific/meaningful)
|
| 425 |
+
meaningful_words.sort(key=len, reverse=True)
|
| 426 |
+
connection_word = meaningful_words[0]
|
| 427 |
+
seed_parts.append(connection_word)
|
| 428 |
+
|
| 429 |
# Combine seed parts
|
| 430 |
seed_text = '. '.join(seed_parts)
|
| 431 |
|
|
|
|
| 434 |
|
| 435 |
# Ensure we have something
|
| 436 |
if not token_ids:
|
| 437 |
+
seed_text = "the field responds. haze resonates"
|
| 438 |
token_ids = self.vocab.encode(seed_text)
|
| 439 |
|
| 440 |
return token_ids, pulse, seed_text
|
|
|
|
| 454 |
user_prompt: What the user said
|
| 455 |
generated_response: What haze generated
|
| 456 |
"""
|
| 457 |
+
# Extract patterns from response (preserves contractions)
|
| 458 |
+
words = tokenize_words(generated_response)
|
| 459 |
|
| 460 |
# Add phrases as patterns
|
| 461 |
for i in range(len(words) - 2):
|