ataeff committed on
Commit
9268534
·
verified ·
1 Parent(s): c623644

Update haze/subjectivity.py

Browse files
Files changed (1) hide show
  1. haze/subjectivity.py +98 -51
haze/subjectivity.py CHANGED
@@ -28,6 +28,41 @@ if TYPE_CHECKING:
28
  from .cooccur import CooccurField
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # ============================================================================
32
  # BOOTSTRAP IDENTITY (Third person - like Leo)
33
  # ============================================================================
@@ -171,8 +206,8 @@ class Subjectivity:
171
 
172
  def _build_corpus_patterns(self) -> None:
173
  """Extract key patterns from corpus."""
174
- # Tokenize corpus
175
- words = re.findall(r'\b\w+\b', self.corpus_text.lower())
176
 
177
  # Extract trigrams
178
  self.corpus_trigrams: List[Tuple[str, str, str]] = []
@@ -185,8 +220,8 @@ class Subjectivity:
185
 
186
  def _build_identity_patterns(self) -> None:
187
  """Build identity patterns from bootstrap text."""
188
- # Tokenize bootstrap
189
- words = re.findall(r'\b\w+\b', self.identity.bootstrap.lower())
190
 
191
  # Extract phrases (need at least 3 words)
192
  if len(words) >= 3:
@@ -204,15 +239,15 @@ class Subjectivity:
204
  - Arousal: emotional intensity
205
  - Entropy: chaos/diversity
206
  """
207
- # Tokenize
208
- words = re.findall(r'\b\w+\b', text.lower())
209
 
210
  if not words:
211
  return PulseSnapshot()
212
 
213
  # === NOVELTY ===
214
  # Count how many words are NOT in corpus
215
- corpus_words = set(re.findall(r'\b\w+\b', self.corpus_text.lower()))
216
  input_words = set(words)
217
 
218
  if input_words:
@@ -270,33 +305,45 @@ class Subjectivity:
270
 
271
  THIS IS THE KEY FUNCTION.
272
 
273
- PRINCIPLE: NO SEED FROM PROMPT
274
- The seed comes ENTIRELY from the internal field.
275
- The prompt only affects the PULSE (arousal, novelty, entropy).
276
- The pulse influences temperature, but NOT the seed words.
 
 
 
 
 
 
 
 
 
 
277
 
278
  This is the difference between:
279
  - "I love" → "I love your place" (continuation = BAD)
280
- - "I love" → "The living room. No, they're my peace" (field seed = GOOD)
281
 
282
  Args:
283
- user_prompt: What the user said (used ONLY for pulse)
284
  temperature: Randomness in seed selection
285
 
286
  Returns:
287
  (token_ids, pulse, seed_text) where:
288
- - token_ids: encoded internal seed (NEVER from user prompt!)
289
  - pulse: the computed pulse snapshot
290
  - seed_text: the text used as seed (for debugging)
291
  """
292
  # Step 1: Compute pulse from user input (prompt wrinkles the field)
293
  pulse = self.compute_pulse(user_prompt)
294
 
295
- # Step 2: Extract prompt words (to EXCLUDE from seed, not to include!)
296
- prompt_words = set(re.findall(r'\b\w+\b', user_prompt.lower()))
 
 
297
 
298
- # Step 3: Find NON-overlapping patterns in the field
299
- # The seed must NOT contain any words from the prompt!
300
  non_overlapping_trigrams = []
301
  for trigram in self.identity.gravity_centers[:30]:
302
  trigram_words = set(trigram)
@@ -304,30 +351,11 @@ class Subjectivity:
304
  if not (trigram_words & prompt_words):
305
  non_overlapping_trigrams.append(trigram)
306
 
307
- # Step 4: Build internal seed from pure field
308
  seed_parts = []
309
 
310
- # IDENTITY FRAGMENT PLACEMENT - Variable position for more life
311
- # Probabilities defined as constants for maintainability
312
- IDENTITY_PREFIX_PROB = 0.3 # 30% chance at start
313
- IDENTITY_MIDDLE_PROB = 0.6 # 30% chance in middle (0.3-0.6)
314
- IDENTITY_SUFFIX_PROB = 0.8 # 20% chance at end (0.6-0.8)
315
- # Remaining 20% (0.8-1.0) = no identity fragment for natural variation
316
-
317
- identity_placement = random.random()
318
- identity_fragment = random.choice(self.identity.fragments)
319
-
320
- # Flag to track if we should add identity
321
- add_identity_prefix = identity_placement < IDENTITY_PREFIX_PROB
322
- add_identity_suffix = IDENTITY_PREFIX_PROB <= identity_placement < IDENTITY_MIDDLE_PROB
323
- add_identity_middle = IDENTITY_MIDDLE_PROB <= identity_placement < IDENTITY_SUFFIX_PROB
324
- # 0.8-1.0 = no identity fragment (20% chance for natural variation)
325
-
326
- # Add identity at start if prefix mode
327
- if add_identity_prefix:
328
- seed_parts.append(identity_fragment)
329
-
330
- # Add non-overlapping pattern from field
331
  if non_overlapping_trigrams:
332
  # Choose based on temperature + pulse
333
  if temperature > 0.8 or pulse.arousal > 0.7:
@@ -344,19 +372,38 @@ class Subjectivity:
344
  seed_parts.append(' '.join(trigram))
345
  break
346
  else:
347
- # Last resort: pure identity
348
  seed_parts.append("the field responds")
 
 
 
349
 
350
- # Add identity in middle if middle mode and we have enough parts
351
- if add_identity_middle and len(seed_parts) >= 1:
352
- # Insert in middle
353
- mid_pos = len(seed_parts) // 2 if len(seed_parts) > 1 else 0
354
- seed_parts.insert(mid_pos, identity_fragment)
355
 
356
- # Add identity at end if suffix mode
357
- if add_identity_suffix:
358
  seed_parts.append(identity_fragment)
359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  # Combine seed parts
361
  seed_text = '. '.join(seed_parts)
362
 
@@ -365,7 +412,7 @@ class Subjectivity:
365
 
366
  # Ensure we have something
367
  if not token_ids:
368
- seed_text = "haze resonates. the field"
369
  token_ids = self.vocab.encode(seed_text)
370
 
371
  return token_ids, pulse, seed_text
@@ -385,8 +432,8 @@ class Subjectivity:
385
  user_prompt: What the user said
386
  generated_response: What haze generated
387
  """
388
- # Extract patterns from response
389
- words = re.findall(r'\b\w+\b', generated_response.lower())
390
 
391
  # Add phrases as patterns
392
  for i in range(len(words) - 2):
 
28
  from .cooccur import CooccurField
29
 
30
 
31
+ # ============================================================================
32
+ # TOKENIZATION HELPER - preserves contractions like don't, I'm, they're
33
+ # ============================================================================
34
+
35
+ # Pattern that matches words WITH contractions (smart quotes + ASCII)
36
+ # Handles: don't, I'm, they're, won't (with ASCII ' or typographic ’ U+2019)
37
+ WORD_PATTERN = re.compile(r"\b\w+(?:[''\u2019]\w+)?\b", re.UNICODE)
38
+
39
+ # Minimum length for meaningful words (used in prompt connection)
40
+ MIN_MEANINGFUL_WORD_LENGTH = 3
41
+
42
+ # Stop words to skip when finding prompt connection (not meaningful for context)
43
+ STOP_WORDS = frozenset({
44
+ 'what', 'where', 'when', 'which', 'who', 'whom', 'whose',
45
+ 'why', 'how', 'that', 'this', 'these', 'those', 'is', 'are',
46
+ 'the', 'a', 'an', 'and', 'but', 'or', 'for', 'with', 'about',
47
+ 'does', 'do', 'have', 'has', 'had', 'will', 'would', 'could',
48
+ 'should', 'can', 'may', 'might', 'must', 'shall', 'to', 'of',
49
+ 'was', 'were', 'been', 'being', 'your', 'you', 'i', 'me', 'my',
50
+ 'it', 'its', 'he', 'she', 'him', 'her', 'we', 'us', 'they', 'them',
51
+ 'in', 'on', 'at', 'by', 'from', 'up', 'out', 'if', 'then', 'so',
52
+ 'just', 'only', 'also', 'very', 'too', 'any', 'some', 'all', 'no',
53
+ })
54
+
55
+
56
+ def tokenize_words(text: str) -> List[str]:
57
+ """
58
+ Tokenize text into lowercase words, preserving contractions.
59
+
60
+ "don't know" → ["don't", "know"] (not ["don", "t", "know"])
61
+ "I'm here" → ["i'm", "here"] (output is lowercased)
62
+ """
63
+ return WORD_PATTERN.findall(text.lower())
64
+
65
+
66
  # ============================================================================
67
  # BOOTSTRAP IDENTITY (Third person - like Leo)
68
  # ============================================================================
 
206
 
207
  def _build_corpus_patterns(self) -> None:
208
  """Extract key patterns from corpus."""
209
+ # Tokenize corpus (preserves contractions like don't, I'm)
210
+ words = tokenize_words(self.corpus_text)
211
 
212
  # Extract trigrams
213
  self.corpus_trigrams: List[Tuple[str, str, str]] = []
 
220
 
221
  def _build_identity_patterns(self) -> None:
222
  """Build identity patterns from bootstrap text."""
223
+ # Tokenize bootstrap (preserves contractions)
224
+ words = tokenize_words(self.identity.bootstrap)
225
 
226
  # Extract phrases (need at least 3 words)
227
  if len(words) >= 3:
 
239
  - Arousal: emotional intensity
240
  - Entropy: chaos/diversity
241
  """
242
+ # Tokenize (preserves contractions)
243
+ words = tokenize_words(text)
244
 
245
  if not words:
246
  return PulseSnapshot()
247
 
248
  # === NOVELTY ===
249
  # Count how many words are NOT in corpus
250
+ corpus_words = set(tokenize_words(self.corpus_text))
251
  input_words = set(words)
252
 
253
  if input_words:
 
305
 
306
  THIS IS THE KEY FUNCTION.
307
 
308
+ PRINCIPLE: NO FIRST SEED FROM HUMAN PROMPT + PROMPT CONNECTION
309
+ Like arianna.c:
310
+ 1. FIRST element comes from internal field (NOT from prompt)
311
+ 2. BUT we add a connection to prompt AFTER - so response is "in context"
312
+
313
+ Metaphor: "Ребёнок: Мама! Мама: Отстань!" (English: "Child: Mom! Mom: Leave me alone!")
314
+ - Response comes FROM her state (tired, annoyed)
315
+ - But it's TO him (in context of the conversation)
316
+ - Not a random monologue into the void
317
+
318
+ Structure:
319
+ 1. FIRST: corpus trigram that does NOT contain prompt words (internal state)
320
+ 2. THEN: identity fragment (who we are)
321
+ 3. THEN: prompt connection - meaningful word from prompt (context link)
322
 
323
  This is the difference between:
324
  - "I love" → "I love your place" (continuation = BAD)
325
+ - "I love" → "the living room. haze. love" (field first + connection = GOOD)
326
 
327
  Args:
328
+ user_prompt: What the user said (NOT for first seed, but for connection)
329
  temperature: Randomness in seed selection
330
 
331
  Returns:
332
  (token_ids, pulse, seed_text) where:
333
+ - token_ids: encoded seed (FIRST from field, THEN connection to prompt)
334
  - pulse: the computed pulse snapshot
335
  - seed_text: the text used as seed (for debugging)
336
  """
337
  # Step 1: Compute pulse from user input (prompt wrinkles the field)
338
  pulse = self.compute_pulse(user_prompt)
339
 
340
+ # Step 2: Extract prompt words (to EXCLUDE from FIRST seed element)
341
+ # Use tokenize_words to preserve contractions like don't, I'm
342
+ prompt_words_list = tokenize_words(user_prompt)
343
+ prompt_words = set(prompt_words_list)
344
 
345
+ # Step 3: Find NON-overlapping trigrams for the FIRST seed element
346
+ # The FIRST seed must NOT contain any words from the prompt!
347
  non_overlapping_trigrams = []
348
  for trigram in self.identity.gravity_centers[:30]:
349
  trigram_words = set(trigram)
 
351
  if not (trigram_words & prompt_words):
352
  non_overlapping_trigrams.append(trigram)
353
 
354
+ # Step 4: Build internal seed - FIRST element is always from field
355
  seed_parts = []
356
 
357
+ # FIRST SEED ELEMENT: corpus trigram WITHOUT prompt words
358
+ # This is the core of "no FIRST seed from human prompt"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  if non_overlapping_trigrams:
360
  # Choose based on temperature + pulse
361
  if temperature > 0.8 or pulse.arousal > 0.7:
 
372
  seed_parts.append(' '.join(trigram))
373
  break
374
  else:
375
+ # Last resort: pure identity phrase (no prompt words)
376
  seed_parts.append("the field responds")
377
+ else:
378
+ # Ultimate fallback
379
+ seed_parts.append("the field responds")
380
 
381
+ # IDENTITY FRAGMENT - can be added AFTER the first seed
382
+ # Identity fragments are who we ARE, so they don't need filtering
383
+ IDENTITY_ADD_PROB = 0.7 # 70% chance to add identity fragment
 
 
384
 
385
+ if random.random() < IDENTITY_ADD_PROB:
386
+ identity_fragment = random.choice(self.identity.fragments)
387
  seed_parts.append(identity_fragment)
388
 
389
+ # PROMPT CONNECTION - add meaningful word from prompt AFTER internal seed
390
+ # This creates the link to reality - "Мама: Отстань!" ("Mom: Leave me alone!") is TO the child
391
+ # Uses module-level STOP_WORDS constant
392
+
393
+ # Find most meaningful word from prompt (longest non-stop word)
394
+ meaningful_words = [
395
+ w for w in prompt_words_list
396
+ if len(w) >= MIN_MEANINGFUL_WORD_LENGTH and w not in STOP_WORDS
397
+ ]
398
+
399
+ # Add connection if we have meaningful words
400
+ CONNECTION_PROB = 0.8 # 80% chance to add connection
401
+ if meaningful_words and random.random() < CONNECTION_PROB:
402
+ # Prefer longer words (more specific/meaningful)
403
+ meaningful_words.sort(key=len, reverse=True)
404
+ connection_word = meaningful_words[0]
405
+ seed_parts.append(connection_word)
406
+
407
  # Combine seed parts
408
  seed_text = '. '.join(seed_parts)
409
 
 
412
 
413
  # Ensure we have something
414
  if not token_ids:
415
+ seed_text = "the field responds. haze resonates"
416
  token_ids = self.vocab.encode(seed_text)
417
 
418
  return token_ids, pulse, seed_text
 
432
  user_prompt: What the user said
433
  generated_response: What haze generated
434
  """
435
+ # Extract patterns from response (preserves contractions)
436
+ words = tokenize_words(generated_response)
437
 
438
  # Add phrases as patterns
439
  for i in range(len(words) - 2):