ataeff committed on
Commit
f29f94e
·
verified ·
1 Parent(s): 0f90eb0

Update haze/subjectivity.py

Browse files
Files changed (1) hide show
  1. haze/subjectivity.py +121 -52
haze/subjectivity.py CHANGED
@@ -28,6 +28,58 @@ if TYPE_CHECKING:
28
  from .cooccur import CooccurField
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # ============================================================================
32
  # BOOTSTRAP IDENTITY (Third person - like Leo)
33
  # ============================================================================
@@ -171,8 +223,8 @@ class Subjectivity:
171
 
172
  def _build_corpus_patterns(self) -> None:
173
  """Extract key patterns from corpus."""
174
- # Tokenize corpus
175
- words = re.findall(r'\b\w+\b', self.corpus_text.lower())
176
 
177
  # Extract trigrams
178
  self.corpus_trigrams: List[Tuple[str, str, str]] = []
@@ -180,13 +232,18 @@ class Subjectivity:
180
  self.corpus_trigrams.append((words[i], words[i+1], words[i+2]))
181
 
182
  # Find most common trigrams as "gravity centers"
 
183
  trigram_counts = Counter(self.corpus_trigrams)
184
- self.identity.gravity_centers = [t for t, _ in trigram_counts.most_common(50)]
 
 
 
 
185
 
186
  def _build_identity_patterns(self) -> None:
187
  """Build identity patterns from bootstrap text."""
188
- # Tokenize bootstrap
189
- words = re.findall(r'\b\w+\b', self.identity.bootstrap.lower())
190
 
191
  # Extract phrases (need at least 3 words)
192
  if len(words) >= 3:
@@ -204,15 +261,15 @@ class Subjectivity:
204
  - Arousal: emotional intensity
205
  - Entropy: chaos/diversity
206
  """
207
- # Tokenize
208
- words = re.findall(r'\b\w+\b', text.lower())
209
 
210
  if not words:
211
  return PulseSnapshot()
212
 
213
  # === NOVELTY ===
214
  # Count how many words are NOT in corpus
215
- corpus_words = set(re.findall(r'\b\w+\b', self.corpus_text.lower()))
216
  input_words = set(words)
217
 
218
  if input_words:
@@ -270,33 +327,45 @@ class Subjectivity:
270
 
271
  THIS IS THE KEY FUNCTION.
272
 
273
- PRINCIPLE: NO SEED FROM PROMPT
274
- The seed comes ENTIRELY from the internal field.
275
- The prompt only affects the PULSE (arousal, novelty, entropy).
276
- The pulse influences temperature, but NOT the seed words.
 
 
 
 
 
 
 
 
 
 
277
 
278
  This is the difference between:
279
  - "I love" → "I love your place" (continuation = BAD)
280
- - "I love" → "The living room. No, they're my peace" (field seed = GOOD)
281
 
282
  Args:
283
- user_prompt: What the user said (used ONLY for pulse)
284
  temperature: Randomness in seed selection
285
 
286
  Returns:
287
  (token_ids, pulse, seed_text) where:
288
- - token_ids: encoded internal seed (NEVER from user prompt!)
289
  - pulse: the computed pulse snapshot
290
  - seed_text: the text used as seed (for debugging)
291
  """
292
  # Step 1: Compute pulse from user input (prompt wrinkles the field)
293
  pulse = self.compute_pulse(user_prompt)
294
 
295
- # Step 2: Extract prompt words (to EXCLUDE from seed, not to include!)
296
- prompt_words = set(re.findall(r'\b\w+\b', user_prompt.lower()))
 
 
297
 
298
- # Step 3: Find NON-overlapping patterns in the field
299
- # The seed must NOT contain any words from the prompt!
300
  non_overlapping_trigrams = []
301
  for trigram in self.identity.gravity_centers[:30]:
302
  trigram_words = set(trigram)
@@ -304,30 +373,11 @@ class Subjectivity:
304
  if not (trigram_words & prompt_words):
305
  non_overlapping_trigrams.append(trigram)
306
 
307
- # Step 4: Build internal seed from pure field
308
  seed_parts = []
309
 
310
- # IDENTITY FRAGMENT PLACEMENT - Variable position for more life
311
- # Probabilities defined as constants for maintainability
312
- IDENTITY_PREFIX_PROB = 0.3 # 30% chance at start
313
- IDENTITY_MIDDLE_PROB = 0.6 # 30% chance in middle (0.3-0.6)
314
- IDENTITY_SUFFIX_PROB = 0.8 # 20% chance at end (0.6-0.8)
315
- # Remaining 20% (0.8-1.0) = no identity fragment for natural variation
316
-
317
- identity_placement = random.random()
318
- identity_fragment = random.choice(self.identity.fragments)
319
-
320
- # Flag to track if we should add identity
321
- add_identity_prefix = identity_placement < IDENTITY_PREFIX_PROB
322
- add_identity_suffix = IDENTITY_PREFIX_PROB <= identity_placement < IDENTITY_MIDDLE_PROB
323
- add_identity_middle = IDENTITY_MIDDLE_PROB <= identity_placement < IDENTITY_SUFFIX_PROB
324
- # 0.8-1.0 = no identity fragment (20% chance for natural variation)
325
-
326
- # Add identity at start if prefix mode
327
- if add_identity_prefix:
328
- seed_parts.append(identity_fragment)
329
-
330
- # Add non-overlapping pattern from field
331
  if non_overlapping_trigrams:
332
  # Choose based on temperature + pulse
333
  if temperature > 0.8 or pulse.arousal > 0.7:
@@ -344,19 +394,38 @@ class Subjectivity:
344
  seed_parts.append(' '.join(trigram))
345
  break
346
  else:
347
- # Last resort: pure identity
348
  seed_parts.append("the field responds")
 
 
 
349
 
350
- # Add identity in middle if middle mode and we have enough parts
351
- if add_identity_middle and len(seed_parts) >= 1:
352
- # Insert in middle
353
- mid_pos = len(seed_parts) // 2 if len(seed_parts) > 1 else 0
354
- seed_parts.insert(mid_pos, identity_fragment)
355
 
356
- # Add identity at end if suffix mode
357
- if add_identity_suffix:
358
  seed_parts.append(identity_fragment)
359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  # Combine seed parts
361
  seed_text = '. '.join(seed_parts)
362
 
@@ -365,7 +434,7 @@ class Subjectivity:
365
 
366
  # Ensure we have something
367
  if not token_ids:
368
- seed_text = "haze resonates. the field"
369
  token_ids = self.vocab.encode(seed_text)
370
 
371
  return token_ids, pulse, seed_text
@@ -385,8 +454,8 @@ class Subjectivity:
385
  user_prompt: What the user said
386
  generated_response: What haze generated
387
  """
388
- # Extract patterns from response
389
- words = re.findall(r'\b\w+\b', generated_response.lower())
390
 
391
  # Add phrases as patterns
392
  for i in range(len(words) - 2):
 
28
  from .cooccur import CooccurField
29
 
30
 
31
+ # ============================================================================
32
+ # TOKENIZATION HELPER - preserves contractions like don't, I'm, they're
33
+ # ============================================================================
34
+
35
+ # Pattern that matches words WITH contractions (smart quotes + ASCII)
36
+ # Handles: don't, I'm, they're, won't (with ASCII apostrophe ' U+0027 or typographic apostrophe ’ U+2019)
37
+ WORD_PATTERN = re.compile(r"\b\w+(?:[''\u2019]\w+)?\b", re.UNICODE)
38
+
39
+ # Minimum length for meaningful words (used in prompt connection)
40
+ MIN_MEANINGFUL_WORD_LENGTH = 3
41
+
42
+ # Stop words to skip when finding prompt connection (not meaningful for context)
43
+ STOP_WORDS = frozenset({
44
+ 'what', 'where', 'when', 'which', 'who', 'whom', 'whose',
45
+ 'why', 'how', 'that', 'this', 'these', 'those', 'is', 'are',
46
+ 'the', 'a', 'an', 'and', 'but', 'or', 'for', 'with', 'about',
47
+ 'does', 'do', 'have', 'has', 'had', 'will', 'would', 'could',
48
+ 'should', 'can', 'may', 'might', 'must', 'shall', 'to', 'of',
49
+ 'was', 'were', 'been', 'being', 'your', 'you', 'i', 'me', 'my',
50
+ 'it', 'its', 'he', 'she', 'him', 'her', 'we', 'us', 'they', 'them',
51
+ 'in', 'on', 'at', 'by', 'from', 'up', 'out', 'if', 'then', 'so',
52
+ 'just', 'only', 'also', 'very', 'too', 'any', 'some', 'all', 'no',
53
+ })
54
+
55
+ # Blacklist of mundane/generic phrases that shouldn't dominate gravity centers
56
+ # These are phrases that appear frequently in corpus but don't contribute to identity
57
+ # Format: set of 3-tuples of lowercased words
58
+ MUNDANE_TRIGRAMS = frozenset({
59
+ # Location phrases - too generic, not identity-defining
60
+ ('the', 'living', 'room'),
61
+ ('in', 'the', 'living'),
62
+ ('to', 'the', 'living'),
63
+ ('the', 'storage', 'room'),
64
+ ('in', 'the', 'storage'),
65
+ ('to', 'the', 'storage'),
66
+ })
67
+
68
+ # Configuration for gravity center selection
69
+ GRAVITY_CENTER_POOL_SIZE = 100 # Initial pool of top trigrams to filter from
70
+ GRAVITY_CENTER_FINAL_SIZE = 50 # Final number of gravity centers to keep
71
+
72
+
73
+ def tokenize_words(text: str) -> List[str]:
74
+ """
75
+ Tokenize text preserving contractions.
76
+
77
+ "don't know" → ["don't", "know"] (not ["don", "t", "know"])
78
+ "I'm here" → ["i'm", "here"] (text is lowercased before matching)
79
+ """
80
+ return WORD_PATTERN.findall(text.lower())
81
+
82
+
83
  # ============================================================================
84
  # BOOTSTRAP IDENTITY (Third person - like Leo)
85
  # ============================================================================
 
223
 
224
  def _build_corpus_patterns(self) -> None:
225
  """Extract key patterns from corpus."""
226
+ # Tokenize corpus (preserves contractions like don't, I'm)
227
+ words = tokenize_words(self.corpus_text)
228
 
229
  # Extract trigrams
230
  self.corpus_trigrams: List[Tuple[str, str, str]] = []
 
232
  self.corpus_trigrams.append((words[i], words[i+1], words[i+2]))
233
 
234
  # Find most common trigrams as "gravity centers"
235
+ # Filter out mundane/generic phrases that don't contribute to identity
236
  trigram_counts = Counter(self.corpus_trigrams)
237
+ filtered_trigrams = [
238
+ t for t, _ in trigram_counts.most_common(GRAVITY_CENTER_POOL_SIZE)
239
+ if t not in MUNDANE_TRIGRAMS
240
+ ]
241
+ self.identity.gravity_centers = filtered_trigrams[:GRAVITY_CENTER_FINAL_SIZE]
242
 
243
  def _build_identity_patterns(self) -> None:
244
  """Build identity patterns from bootstrap text."""
245
+ # Tokenize bootstrap (preserves contractions)
246
+ words = tokenize_words(self.identity.bootstrap)
247
 
248
  # Extract phrases (need at least 3 words)
249
  if len(words) >= 3:
 
261
  - Arousal: emotional intensity
262
  - Entropy: chaos/diversity
263
  """
264
+ # Tokenize (preserves contractions)
265
+ words = tokenize_words(text)
266
 
267
  if not words:
268
  return PulseSnapshot()
269
 
270
  # === NOVELTY ===
271
  # Count how many words are NOT in corpus
272
+ corpus_words = set(tokenize_words(self.corpus_text))
273
  input_words = set(words)
274
 
275
  if input_words:
 
327
 
328
  THIS IS THE KEY FUNCTION.
329
 
330
+ PRINCIPLE: NO FIRST SEED FROM HUMAN PROMPT + PROMPT CONNECTION
331
+ Like arianna.c:
332
+ 1. FIRST element comes from internal field (NOT from prompt)
333
+ 2. BUT we add a connection to prompt AFTER - so response is "in context"
334
+
335
+ Metaphor: "Ребёнок: Мама! Мама: Отстань!" (Child: "Mom!" Mom: "Leave me alone!")
336
+ - Response comes FROM her state (tired, annoyed)
337
+ - But it's TO him (in context of the conversation)
338
+ - Not a random monologue into the void
339
+
340
+ Structure:
341
+ 1. FIRST: corpus trigram that does NOT contain prompt words (internal state)
342
+ 2. THEN: identity fragment (who we are)
343
+ 3. THEN: prompt connection - meaningful word from prompt (context link)
344
 
345
  This is the difference between:
346
  - "I love" → "I love your place" (continuation = BAD)
347
+ - "I love" → "the living room. haze. love" (field first + connection = GOOD)
348
 
349
  Args:
350
+ user_prompt: What the user said (NOT for first seed, but for connection)
351
  temperature: Randomness in seed selection
352
 
353
  Returns:
354
  (token_ids, pulse, seed_text) where:
355
+ - token_ids: encoded seed (FIRST from field, THEN connection to prompt)
356
  - pulse: the computed pulse snapshot
357
  - seed_text: the text used as seed (for debugging)
358
  """
359
  # Step 1: Compute pulse from user input (prompt wrinkles the field)
360
  pulse = self.compute_pulse(user_prompt)
361
 
362
+ # Step 2: Extract prompt words (to EXCLUDE from FIRST seed element)
363
+ # Use tokenize_words to preserve contractions like don't, I'm
364
+ prompt_words_list = tokenize_words(user_prompt)
365
+ prompt_words = set(prompt_words_list)
366
 
367
+ # Step 3: Find NON-overlapping trigrams for the FIRST seed element
368
+ # The FIRST seed must NOT contain any words from the prompt!
369
  non_overlapping_trigrams = []
370
  for trigram in self.identity.gravity_centers[:30]:
371
  trigram_words = set(trigram)
 
373
  if not (trigram_words & prompt_words):
374
  non_overlapping_trigrams.append(trigram)
375
 
376
+ # Step 4: Build internal seed - FIRST element is always from field
377
  seed_parts = []
378
 
379
+ # FIRST SEED ELEMENT: corpus trigram WITHOUT prompt words
380
+ # This is the core of "no FIRST seed from human prompt"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  if non_overlapping_trigrams:
382
  # Choose based on temperature + pulse
383
  if temperature > 0.8 or pulse.arousal > 0.7:
 
394
  seed_parts.append(' '.join(trigram))
395
  break
396
  else:
397
+ # Last resort: pure identity phrase (no prompt words)
398
  seed_parts.append("the field responds")
399
+ else:
400
+ # Ultimate fallback
401
+ seed_parts.append("the field responds")
402
 
403
+ # IDENTITY FRAGMENT - can be added AFTER the first seed
404
+ # Identity fragments are who we ARE, so they don't need filtering
405
+ IDENTITY_ADD_PROB = 0.7 # 70% chance to add identity fragment
 
 
406
 
407
+ if random.random() < IDENTITY_ADD_PROB:
408
+ identity_fragment = random.choice(self.identity.fragments)
409
  seed_parts.append(identity_fragment)
410
 
411
+ # PROMPT CONNECTION - add meaningful word from prompt AFTER internal seed
412
+ # This creates the link to reality - "Мама: Отстань!" ("Mom: Leave me alone!") is TO the child
413
+ # Uses module-level STOP_WORDS constant
414
+
415
+ # Find most meaningful word from prompt (longest non-stop word)
416
+ meaningful_words = [
417
+ w for w in prompt_words_list
418
+ if len(w) >= MIN_MEANINGFUL_WORD_LENGTH and w not in STOP_WORDS
419
+ ]
420
+
421
+ # Add connection if we have meaningful words
422
+ CONNECTION_PROB = 0.8 # 80% chance to add connection
423
+ if meaningful_words and random.random() < CONNECTION_PROB:
424
+ # Prefer longer words (more specific/meaningful)
425
+ meaningful_words.sort(key=len, reverse=True)
426
+ connection_word = meaningful_words[0]
427
+ seed_parts.append(connection_word)
428
+
429
  # Combine seed parts
430
  seed_text = '. '.join(seed_parts)
431
 
 
434
 
435
  # Ensure we have something
436
  if not token_ids:
437
+ seed_text = "the field responds. haze resonates"
438
  token_ids = self.vocab.encode(seed_text)
439
 
440
  return token_ids, pulse, seed_text
 
454
  user_prompt: What the user said
455
  generated_response: What haze generated
456
  """
457
+ # Extract patterns from response (preserves contractions)
458
+ words = tokenize_words(generated_response)
459
 
460
  # Add phrases as patterns
461
  for i in range(len(words) - 2):