trainman999 commited on
Commit
e151fed
·
verified ·
1 Parent(s): 259aa9b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +284 -169
app.py CHANGED
@@ -1,16 +1,20 @@
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
  """
4
- NeuroSymbolic V8.5 - Cohomology-Reduced + Calculated Word Age (AoA)
5
- + Double-Entendre Dot-Product With Shift
6
 
7
- Key upgrade (this revision): replace the single hash-dot-product boost with a
8
- two-pass ("double entendre") dot product:
9
 
10
- Pass 1: dot(embed(w2), embed(candidate))
11
- Pass 2: dot(embed(w2) + shift(w1), embed(candidate))
 
 
 
 
12
 
13
- Candidates that stay strong under BOTH frames get an agreement bonus.
 
 
14
  """
15
 
16
  from __future__ import annotations
@@ -44,18 +48,14 @@ TOPO_KEYWORDS = {
44
  "betti", "euler", "simplicial", "homotopy", "manifold", "morse", "sheaf"
45
  }
46
 
47
- # English vowels for syllable counting
48
  _VOWELS = set("aeiouy")
49
 
50
- # Common English phoneme bigrams (high-frequency β†’ imply easier words)
51
- # Derived from Brown corpus letter-bigram frequencies
52
  _COMMON_BIGRAMS: set = {
53
  "th", "he", "in", "er", "an", "re", "on", "en", "at", "ou",
54
  "ed", "nd", "to", "or", "ea", "ti", "es", "st", "ar", "nt",
55
  "is", "al", "it", "as", "ha", "et", "se", "ng", "le", "of",
56
  }
57
 
58
- # Derivational affixes that mark morphologically complex (later-acquired) words
59
  _LATINATE_PREFIXES = {
60
  "pre", "post", "anti", "auto", "bio", "geo", "hyper", "hypo",
61
  "inter", "intra", "micro", "macro", "meta", "mono", "multi",
@@ -69,7 +69,6 @@ _LATINATE_SUFFIXES = {
69
  "ation", "ization", "isation",
70
  }
71
 
72
- # Very early-acquired core vocabulary (prototype list, mean AoA < 4 yr)
73
  _EARLY_WORDS: Dict[str, float] = {
74
  "cat": 2.5, "dog": 2.5, "mom": 2.2, "dad": 2.2, "baby": 2.8,
75
  "ball": 2.6, "cup": 2.7, "eye": 2.4, "ear": 2.5, "nose": 2.6,
@@ -84,7 +83,69 @@ _EARLY_WORDS: Dict[str, float] = {
84
  }
85
 
86
  # ────────────────────────────────────────────────────────────────────────────
87
- # AoA DATASET (normed + calculated fallback)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  # ────────────────────────────────────────────────────────────────────────────
89
  AOA_DATASET_URL = (
90
  "https://norare.clld.org/contributions/Kuperman-2012-AoA/English-AoA-30K.csv"
@@ -94,10 +155,6 @@ AOA_COL_AOA = "AoA"
94
 
95
 
96
  def load_aoa_dataset(max_rows: int = 35_000) -> Dict[str, float]:
97
- """
98
- Load Kuperman 2012 AoA norms from CLLD (if reachable).
99
- Returns {word_lower: aoa_years}. Falls back to {} on failure.
100
- """
101
  try:
102
  df = pd.read_csv(AOA_DATASET_URL, nrows=max_rows)
103
  if AOA_COL_WORD not in df.columns or AOA_COL_AOA not in df.columns:
@@ -115,8 +172,7 @@ def load_aoa_dataset(max_rows: int = 35_000) -> Dict[str, float]:
115
  # WORD-AGE CALCULATOR
116
  # ────────────────────────────────────────────────────────────────────────────
117
  def _count_syllables(word: str) -> int:
118
- """Heuristic English syllable counter."""
119
- w = word.lower().rstrip("e") # silent final e
120
  count = sum(
121
  1
122
  for i, c in enumerate(w)
@@ -126,10 +182,6 @@ def _count_syllables(word: str) -> int:
126
 
127
 
128
  def _morpheme_complexity(word: str) -> float:
129
- """
130
- Returns a complexity score in [0, 1] based on recognisable derivational
131
- prefixes and suffixes. Each affix adds 0.25, capped at 1.0.
132
- """
133
  w = word.lower()
134
  score = 0.0
135
  for p in _LATINATE_PREFIXES:
@@ -138,16 +190,12 @@ def _morpheme_complexity(word: str) -> float:
138
  break
139
  for s in _LATINATE_SUFFIXES:
140
  if w.endswith(s) and len(w) > len(s) + 2:
141
- score += 0.25 * (1 + len(s) / 6) # longer suffixes β†’ more complex
142
  break
143
  return min(1.0, score)
144
 
145
 
146
  def _bigram_familiarity(word: str) -> float:
147
- """
148
- Fraction of consecutive letter pairs that appear in the common-bigram set.
149
- Higher β†’ more phonotactically familiar β†’ acquired earlier.
150
- """
151
  w = word.lower()
152
  if len(w) < 2:
153
  return 0.5
@@ -156,11 +204,6 @@ def _bigram_familiarity(word: str) -> float:
156
 
157
 
158
  def _ortho_neighborhood_size(word: str, aoa_dict: Dict[str, float]) -> int:
159
- """
160
- Approximate orthographic neighbourhood (Coltheart's N):
161
- count words in the AoA dict that differ by exactly one letter.
162
- Capped at 20 for speed.
163
- """
164
  w = word.lower()
165
  n = len(w)
166
  count = 0
@@ -180,50 +223,26 @@ def calculate_word_age(
180
  corpus_freq: Optional[Dict[str, int]] = None,
181
  corpus_total: int = 1,
182
  ) -> float:
183
- """
184
- Estimate age-of-acquisition for *word* in years.
185
-
186
- Priority:
187
- 1. Normed value from Kuperman 2012 (exact match)
188
- 2. Prototype entry in _EARLY_WORDS
189
- 3. Computed estimate from linguistic features
190
-
191
- Feature model (linear, calibrated to Kuperman distribution):
192
- AoA β‰ˆ intercept
193
- + Ξ²_len * (chars - 5)
194
- + Ξ²_syl * (syllables - 2)
195
- + Ξ²_morph * morpheme_complexity
196
- - Ξ²_big * bigram_familiarity
197
- - Ξ²_freq * log_rel_freq
198
- - Ξ²_neigh * log(1 + neighbourhood)
199
- """
200
  w = word.lower().strip()
201
  if not w or not w[0].isalpha():
202
  return 10.0
203
-
204
- # 1. Normed lookup
205
  if w in aoa:
206
  return aoa[w]
207
-
208
- # 2. Prototype list
209
  if w in _EARLY_WORDS:
210
  return _EARLY_WORDS[w]
211
 
212
- # ── Feature extraction ──────────────────────────────────────────────────
213
  n_chars = len(w)
214
  n_syl = _count_syllables(w)
215
  morph = _morpheme_complexity(w)
216
  bigram_f = _bigram_familiarity(w)
217
  neigh = _ortho_neighborhood_size(w, aoa)
218
 
219
- # Corpus frequency (log relative frequency, 0 if absent)
220
  if corpus_freq and w in corpus_freq:
221
  rel_freq = corpus_freq[w] / max(corpus_total, 1)
222
- log_freq = math.log(1 + rel_freq * 1_000_000) # per-million scale
223
  else:
224
  log_freq = 0.0
225
 
226
- # ── Linear model ────────────────────────────────────────────────────────
227
  intercept = 8.5
228
  Ξ²_len = 0.30
229
  Ξ²_syl = 0.55
@@ -251,23 +270,32 @@ def word_age(
251
  corpus_freq: Optional[Dict[str, int]] = None,
252
  corpus_total: int = 1,
253
  ) -> float:
254
- """Public accessor β€” uses calculate_word_age."""
255
  return calculate_word_age(token, aoa, corpus_freq, corpus_total)
256
 
257
 
258
  def age_continuity_boost(age1: float, age2: float, strength: float = 0.12) -> float:
259
- """Low-differentiation: small positive bias for similar (and earlier) ages."""
260
  d = abs(age1 - age2)
261
  early = min(age1, age2, 8.0) / 8.0
262
  return float(strength * math.exp(-d / 3.0) * early)
263
 
264
 
265
  # ────────────────────────────────────────────────────────────────────────────
266
- # COHOMOLOGY SCALARS
267
  # ────────────────────────────────────────────────────────────────────────────
268
  def topo_weight(token: str) -> float:
 
 
 
 
 
 
 
269
  tl = token.lower()
270
- return min(1.0, sum(0.4 for kw in TOPO_KEYWORDS if kw in tl))
 
 
 
 
271
 
272
 
273
  def semantic_scalar(t1: str, t2: str) -> float:
@@ -296,31 +324,45 @@ def centroid_boost(
296
 
297
 
298
  # ────────────────────────────────────────────────────────────────────────────
299
- # DOUBLE ENTENDRE HASH EMBEDDING (two dot products + shift)
300
  # ────────────────────────────────────────────────────────────────────────────
301
- class DoubleEntendreEmbedder:
302
  """
303
- Two-pass dot product:
304
 
305
- pass1 = dot(embed(w2), embed(c))
306
- pass2 = dot(embed(w2) + shift(w1), embed(c))
 
 
307
 
308
- combined = 0.5*(norm01(pass1) + norm01(pass2)) + agreement_bonus*min(norm01(pass1), norm01(pass2))
309
- """
 
310
 
311
- DIM = 4
 
 
 
 
312
 
313
- def embed(self, token: str) -> np.ndarray:
314
- h = int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16)
315
- vec = np.array([(h >> (8 * i)) & 0xFF for i in range(self.DIM)], dtype=np.float32)
 
 
 
 
 
316
  s = float(vec.sum())
317
  return vec / (s + 1e-8)
318
 
319
- def shift_vector(self, token: str, magnitude: float = 0.15) -> np.ndarray:
320
- h = int(hashlib.sha256(token.encode("utf-8")).hexdigest(), 16)
321
- vec = np.array([(h >> (8 * i)) & 0xFF for i in range(self.DIM)], dtype=np.float32)
322
- vec = vec / (np.linalg.norm(vec) + 1e-8)
323
- return vec * float(magnitude)
 
 
324
 
325
  @staticmethod
326
  def _norm01(arr: np.ndarray) -> np.ndarray:
@@ -328,39 +370,72 @@ class DoubleEntendreEmbedder:
328
  mx = float(arr.max())
329
  return (arr - mn) / (mx - mn + 1e-12)
330
 
331
- def double_entendre_weights(
332
  self,
333
  w1: str,
334
  w2: str,
335
  candidates: List[str],
336
- shift_mag: float = 0.15,
337
- agreement_bonus: float = 0.30,
338
  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
339
- anchor = self.embed(w2)
340
- shifted_anchor = anchor + self.shift_vector(w1, magnitude=shift_mag)
341
 
342
- # Keep the same "L1-ish" normalization style as embed()
343
- shifted_anchor = shifted_anchor / (shifted_anchor.sum() + 1e-8)
 
 
 
 
344
 
345
- cand_vecs = np.array([self.embed(c) for c in candidates], dtype=np.float32) # (N, DIM)
346
- pass1 = cand_vecs @ anchor
347
- pass2 = cand_vecs @ shifted_anchor
348
 
349
- p1 = self._norm01(pass1)
350
- p2 = self._norm01(pass2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
  de_score = np.minimum(p1, p2)
353
- combined = 0.5 * (p1 + p2) + float(agreement_bonus) * de_score
 
 
 
 
 
 
 
 
 
 
354
  combined = self._norm01(combined)
 
355
  return p1, p2, combined
356
 
357
 
 
 
 
 
358
  # ────────────────────────────────────────────────────────────────────────────
359
  # LANGUAGE MODEL
360
  # ────────────────────────────────────────────────────────────────────────────
361
  class NGramLM:
362
- """Trigram LM with high add_k for flat (low-differentiation) distributions."""
363
-
364
  def __init__(self, add_k: float = 1.5):
365
  self.add_k = float(add_k)
366
  self.uni: Dict[str, int] = {}
@@ -463,7 +538,7 @@ def detokenize(tokens: List[str]) -> str:
463
  @dataclass
464
  class CorpusState:
465
  lm: NGramLM
466
- embedder: DoubleEntendreEmbedder
467
  aoa: Dict[str, float]
468
  token_boost: Dict[str, float] = field(default_factory=dict)
469
  corpus_freq: Dict[str, int] = field(default_factory=dict)
@@ -474,7 +549,7 @@ def build_state(text: str, aoa: Dict[str, float]) -> CorpusState:
474
  tokens = tokenize(text)
475
  lm = NGramLM(add_k=1.5)
476
  lm.ingest(tokens)
477
- embedder = DoubleEntendreEmbedder()
478
 
479
  total = max(1, sum(lm.uni.values()))
480
  token_boost: Dict[str, float] = {}
@@ -500,20 +575,15 @@ def next_probs(
500
  w1: str,
501
  w2: str,
502
  temp: float = 1.2,
503
- boost_strength: float = 0.2, # kept (but no longer used for dot prod)
504
- de_strength: float = 0.18, # strength of double-entendre similarity
505
- de_shift_mag: float = 0.15, # shift magnitude for 2nd frame
506
- de_agreement_bonus: float = 0.30, # extra reward for agreement (min)
507
  ema_prev: Optional[torch.Tensor] = None,
508
  ema_cands: Optional[List[str]] = None,
509
  ) -> Tuple[List[str], torch.Tensor]:
510
  cands, base_probs = state.lm.next_dist(w1, w2)
511
 
512
- # Double-entendre dot-product weights
513
- _, _, de_combined = state.embedder.double_entendre_weights(
514
  w1=w1, w2=w2, candidates=cands,
515
- shift_mag=float(de_shift_mag),
516
- agreement_bonus=float(de_agreement_bonus),
517
  )
518
  de_t = torch.tensor(de_combined, dtype=torch.float32)
519
 
@@ -526,7 +596,6 @@ def next_probs(
526
  cb_t = torch.tensor(cb, dtype=torch.float32)
527
  tb = torch.tensor([state.token_boost.get(c, 0.0) for c in cands], dtype=torch.float32)
528
 
529
- # AoA continuity
530
  w2_age = word_age(state.aoa, w2, state.corpus_freq, state.corpus_total)
531
  age_arr = np.array(
532
  [age_continuity_boost(
@@ -537,12 +606,17 @@ def next_probs(
537
  )
538
  age_t = torch.tensor(age_arr, dtype=torch.float32)
539
 
540
- boosts = float(de_strength) * de_t + cb_t + 0.10 * tb + 0.15 * age_t
 
 
 
 
 
 
541
  logits = torch.log(base_probs.clamp_min(1e-12)) + boosts
542
  logits = logits / max(float(temp), 1e-6)
543
  probs = F.softmax(logits, dim=-1)
544
 
545
- # EMA smoothing
546
  if ema_prev is not None and ema_cands is not None:
547
  prev_idx = {w: i for i, w in enumerate(ema_cands)}
548
  aligned = torch.zeros_like(probs)
@@ -572,37 +646,37 @@ def generate(
572
  w2 = sw[-1] if sw else "concept"
573
 
574
  voices = [
575
- ("Positor", [
576
- "what", "how", "when", "why", "where", "whether", "imagine", "suppose", "consider", "define",
577
- "state", "pose", "query", "assert", "envision", "propose", "determine", "specify", "outline", "identify",
578
- "explore", "focus", "express", "declare", "suggest"
579
- ]),
580
- ("Analyzer", [
581
- "because", "therefore", "thus", "hence", "examine", "observe", "inspect", "compare", "contrast", "deduce",
582
- "infer", "evaluate", "scrutinize", "measure", "determine", "diagnose", "trace", "test", "quantify", "assess",
583
- "prove", "analyze", "dissect", "uncover", "establish"
584
- ]),
585
- ("Synthesizer", [
586
- "thus", "between", "integrates", "suggests", "combines", "merges", "connects", "unifies", "fuses", "blends",
587
- "resolves", "harmonizes", "links", "joins", "bridges", "reconciles", "aligns", "connects", "coalesces", "balances",
588
- "melds", "incorporates", "relates", "summarizes", "converges"
589
- ]),
590
- ("Reflector", [
591
- "ultimately", "reveals", "illuminates", "perhaps", "maybe", "indicates", "implies", "evokes", "signifies", "suggests",
592
- "contemplates", "meditates", "distills", "uncovers", "concludes", "infers", "recognizes", "appreciates", "ponders", "rethinks",
593
- "interprets", "acknowledges", "realizes", "wonders", "discerns"
594
- ]),
595
- ("Connector", [
596
- "relates", "links", "bridges", "connects", "associates", "correlates", "binds", "ties", "concatenates", "couples",
597
- "unites", "joins", "interweaves", "crosses", "maps", "compares", "contextualizes", "interrelates", "interlaces", "binds",
598
- "matches", "aggregates", "corresponds", "equates", "aligns"
599
- ]),
600
- ("Elaborator", [
601
- "further", "moreover", "extends", "develops", "expands", "deepens", "broadens", "amplifies", "details", "illustrates",
602
- "enhances", "supports", "enriches", "reiterates", "strengthens", "continues", "adds", "accentuates", "clarifies", "builds",
603
- "reinforces", "emphasizes", "substantiates", "heightens", "extends"
604
- ]),
605
- ][: max(1, int(num_voices))]
606
 
607
  result: List[Tuple[str, List[str]]] = []
608
  current_voice = 0
@@ -687,16 +761,12 @@ def load_corpus(
687
 
688
 
689
  # ────────────────────────────────────────────────────────────────────────────
690
- # AGE ANALYSIS HELPER (for stats panel)
691
  # ────────────────────────────────────────────────────────────────────────────
692
- def age_analysis(
693
  state: CorpusState,
694
  top_n: int = 10,
695
  ) -> str:
696
- """
697
- Produce a brief report on word-age distribution in the corpus vocabulary.
698
- Shows which words are youngest/oldest by calculated AoA.
699
- """
700
  alpha_vocab = [t for t in state.lm.vocab if t.isalpha() and t not in STOP_WORDS]
701
  if not alpha_vocab:
702
  return "No alpha vocabulary found."
@@ -706,7 +776,6 @@ def age_analysis(
706
  for t in alpha_vocab
707
  }
708
  sorted_ages = sorted(ages.items(), key=lambda x: x[1])
709
-
710
  youngest = sorted_ages[:top_n]
711
  oldest = sorted_ages[-top_n:][::-1]
712
 
@@ -717,6 +786,27 @@ def age_analysis(
717
  sum((v - mean_age) ** 2 for v in ages.values()) / max(1, len(ages))
718
  )
719
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
720
  lines = [
721
  f"Alpha vocab: {len(alpha_vocab)} words",
722
  f" Normed (Kuperman): {normed}",
@@ -728,7 +818,14 @@ def age_analysis(
728
  "",
729
  f"Oldest {top_n} (latest acquired):",
730
  " " + ", ".join(f"{w}({a:.1f})" for w, a in oldest),
731
- ]
 
 
 
 
 
 
 
732
  return "\n".join(lines)
733
 
734
 
@@ -737,7 +834,7 @@ def age_analysis(
737
  # ────────────────────────────────────────────────────────────────────────────
738
  def run_session(
739
  use_hf, hf_dataset, hf_split, hf_max_rows,
740
- text_file, prompt, seed, max_tokens, num_voices, temp,
741
  progress=gr.Progress(),
742
  ):
743
  try:
@@ -750,8 +847,8 @@ def run_session(
750
  progress(0.40, desc="Building language model…")
751
  state = build_state(text, aoa)
752
 
753
- progress(0.60, desc="Analysing word ages…")
754
- age_stats = age_analysis(state)
755
 
756
  progress(0.70, desc="Generating narrative…")
757
  out_md = generate(
@@ -760,23 +857,34 @@ def run_session(
760
  seed=int(seed),
761
  num_voices=int(num_voices),
762
  temp=float(temp),
 
763
  )
764
 
765
  vocab_size = len(state.lm.vocab)
766
- topo_hits = [t for t in state.lm.vocab if topo_weight(t) > 0]
767
  normed = sum(1 for t in state.lm.vocab if t.isalpha() and t in aoa)
768
  alpha_total = sum(1 for t in state.lm.vocab if t.isalpha())
769
 
 
 
 
 
 
 
 
 
770
  stats = "\n".join([
771
  f"Vocab size: {vocab_size}",
772
  f"AoA normed (Kuperman exact): {normed}/{alpha_total}",
773
  f"AoA calculated (feature model): {alpha_total - normed}/{alpha_total}",
774
- f"Topo tokens: {len(topo_hits)} ({', '.join(topo_hits[:8])})",
775
- f"Temperature: {float(temp):.2f}",
776
- f"add_k: {state.lm.add_k:.2f}",
777
  f"Generated tokens: {int(max_tokens)}",
778
  "",
779
- "── Word-Age Distribution ──",
 
 
 
780
  age_stats,
781
  ])
782
  return out_md, stats
@@ -798,16 +906,21 @@ def toggle_hf(val):
798
 
799
  def build_app():
800
  with gr.Blocks(
801
- title="NeuroSymbolic V8.5 β€” Calculated Word Age + Double Entendre Dot Product",
802
  theme=gr.themes.Soft(),
803
  ) as demo:
804
  gr.Markdown(
805
- "# NeuroSymbolic V8.5 β€” Cohomology-Reduced + Calculated Word Age\n"
806
- "Word age (AoA) is now **calculated** for every token, not just looked up.\n\n"
807
- "**Priority:** normed Kuperman 2012 β†’ prototype list β†’ feature-based estimator \n"
808
- "**Features used:** word length, syllable count, morpheme complexity, "
809
- "bigram familiarity, corpus frequency, orthographic neighbourhood size.\n\n"
810
- "**New:** double-entendre dot-product boost (two dot products with a shifted frame)."
 
 
 
 
 
811
  )
812
 
813
  with gr.Row():
@@ -823,6 +936,7 @@ def build_app():
823
  max_tokens = gr.Slider(100, 800, value=300, step=50, label="Max Tokens")
824
  num_voices = gr.Slider(2, 6, value=3, step=1, label="Narrative Voices")
825
  temp = gr.Slider(0.8, 2.5, value=1.4, step=0.1, label="Temperature")
 
826
 
827
  with gr.Column(scale=2):
828
  prompt = gr.Textbox(
@@ -833,25 +947,26 @@ def build_app():
833
  btn = gr.Button("Generate", variant="primary", size="lg")
834
  gr.Markdown("## Generated Narrative (roles)")
835
  output_md = gr.Markdown(value="")
836
- output_stats = gr.Textbox(label="Stats + Word-Age Analysis", lines=20)
837
 
838
  btn.click(
839
  run_session,
840
  inputs=[use_hf, hf_dataset, hf_split, hf_max_rows,
841
- text_file, prompt, seed, max_tokens, num_voices, temp],
842
  outputs=[output_md, output_stats],
843
  )
844
 
845
  gr.Markdown(
846
- "### Notes\n"
847
- "- If the Kuperman CSV is unreachable, the model falls back to the "
848
- "feature-based estimator for *all* tokens (no flat 10.0).\n"
849
- "- Install: `pip install gradio datasets torch pandas numpy`\n"
850
- "- The word-age estimator is calibrated to the Kuperman distribution "
851
- "(mean β‰ˆ 8.5 yr, SD β‰ˆ 2.5 yr)."
 
852
  )
853
  return demo
854
 
855
 
856
  if __name__ == "__main__":
857
- build_app().queue().launch(share=False)
 
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
  """
4
+ NeuroSymbolic V8.6 - Length-Dependent Topology Dot Products
 
5
 
6
+ Key upgrade (this revision): topology dot products now scale with token length.
 
7
 
8
+ Length-Dependent Topology:
9
+ - Embedding dimension DIM scales with word length (longer words → higher-dim space)
10
+ - topo_weight() scales with char-length, rewarding morphologically rich tokens
11
+ - shift_magnitude scales with length (longer words get stronger frame-shift)
12
+ - agreement_bonus scales with length (longer words need stronger cross-frame consensus)
13
+ - A length-weighted topology kernel modulates the final dot-product combination
14
 
15
+ This means short/simple words (cat, dog) use compact 2-4D embeddings with mild
16
+ topology influence, while long/complex words (cohomology, reconstruction) use
17
+ up to 12D embeddings with much stronger topological modulation.
18
  """
19
 
20
  from __future__ import annotations
 
48
  "betti", "euler", "simplicial", "homotopy", "manifold", "morse", "sheaf"
49
  }
50
 
 
51
  _VOWELS = set("aeiouy")
52
 
 
 
53
  _COMMON_BIGRAMS: set = {
54
  "th", "he", "in", "er", "an", "re", "on", "en", "at", "ou",
55
  "ed", "nd", "to", "or", "ea", "ti", "es", "st", "ar", "nt",
56
  "is", "al", "it", "as", "ha", "et", "se", "ng", "le", "of",
57
  }
58
 
 
59
  _LATINATE_PREFIXES = {
60
  "pre", "post", "anti", "auto", "bio", "geo", "hyper", "hypo",
61
  "inter", "intra", "micro", "macro", "meta", "mono", "multi",
 
69
  "ation", "ization", "isation",
70
  }
71
 
 
72
  _EARLY_WORDS: Dict[str, float] = {
73
  "cat": 2.5, "dog": 2.5, "mom": 2.2, "dad": 2.2, "baby": 2.8,
74
  "ball": 2.6, "cup": 2.7, "eye": 2.4, "ear": 2.5, "nose": 2.6,
 
83
  }
84
 
85
  # ────────────────────────────────────────────────────────────────────────────
86
# LENGTH-DEPENDENT TOPOLOGY PARAMETERS
# ────────────────────────────────────────────────────────────────────────────

# Embedding dimension scales from DIM_MIN to DIM_MAX with word length.
DIM_MIN = 2                  # dimension floor for the shortest words
DIM_MAX = 12                 # dimension ceiling for the longest words
LENGTH_CEIL = 14             # word length at which the scaling saturates
SHIFT_MAG_MIN = 0.05         # frame-shift magnitude for short words
SHIFT_MAG_MAX = 0.35         # frame-shift magnitude for long words
AGREEMENT_BONUS_MIN = 0.10   # cross-frame agreement bonus for short words
AGREEMENT_BONUS_MAX = 0.60   # cross-frame agreement bonus for long words


def length_alpha(word: str, ceil: int = LENGTH_CEIL) -> float:
    """
    Smooth length factor alpha in the open interval (0, 1).

    A logistic curve centred at ``ceil / 2`` with slope 0.55: alpha is
    close to 0 for very short words, about 0.5 for medium-length words,
    and approaches (asymptotically, never exactly reaching) 1 for words
    at or beyond ``ceil`` characters.
    """
    n = len(word.strip())
    mid = ceil / 2.0
    return float(1.0 / (1.0 + math.exp(-0.55 * (n - mid))))


def length_dim(word: str) -> int:
    """
    Length-scaled embedding dimension for a word.

    Short words get DIM_MIN; long words approach DIM_MAX.  The result is
    rounded to the nearest even integer (never below DIM_MIN) for cleaner
    hash decomposition.
    """
    alpha = length_alpha(word)
    raw = DIM_MIN + alpha * (DIM_MAX - DIM_MIN)
    return max(DIM_MIN, int(round(raw / 2) * 2))  # round to nearest even


def length_shift_mag(word: str) -> float:
    """Frame-shift magnitude, linearly interpolated by word length."""
    alpha = length_alpha(word)
    return SHIFT_MAG_MIN + alpha * (SHIFT_MAG_MAX - SHIFT_MAG_MIN)


def length_agreement_bonus(word: str) -> float:
    """Cross-frame agreement bonus, linearly interpolated by word length."""
    alpha = length_alpha(word)
    return AGREEMENT_BONUS_MIN + alpha * (AGREEMENT_BONUS_MAX - AGREEMENT_BONUS_MIN)


def length_topo_kernel(word: str) -> float:
    """
    Length-dependent multiplier in [0.05, 1.0) for how strongly topology
    modulates the dot product.

    Uses an exponential ramp (alpha ** 1.5) so topology barely influences
    short words and strongly modulates long/complex ones.
    """
    alpha = length_alpha(word)
    return float(0.05 + 0.95 * (alpha ** 1.5))
145
+
146
+
147
+ # ────────────────────────────────────────────────────────────────────────────
148
+ # AoA DATASET
149
  # ────────────────────────────────────────────────────────────────────────────
150
  AOA_DATASET_URL = (
151
  "https://norare.clld.org/contributions/Kuperman-2012-AoA/English-AoA-30K.csv"
 
155
 
156
 
157
  def load_aoa_dataset(max_rows: int = 35_000) -> Dict[str, float]:
 
 
 
 
158
  try:
159
  df = pd.read_csv(AOA_DATASET_URL, nrows=max_rows)
160
  if AOA_COL_WORD not in df.columns or AOA_COL_AOA not in df.columns:
 
172
  # WORD-AGE CALCULATOR
173
  # ────────────────────────────────────────────────────────────────────────────
174
  def _count_syllables(word: str) -> int:
175
+ w = word.lower().rstrip("e")
 
176
  count = sum(
177
  1
178
  for i, c in enumerate(w)
 
182
 
183
 
184
  def _morpheme_complexity(word: str) -> float:
 
 
 
 
185
  w = word.lower()
186
  score = 0.0
187
  for p in _LATINATE_PREFIXES:
 
190
  break
191
  for s in _LATINATE_SUFFIXES:
192
  if w.endswith(s) and len(w) > len(s) + 2:
193
+ score += 0.25 * (1 + len(s) / 6)
194
  break
195
  return min(1.0, score)
196
 
197
 
198
  def _bigram_familiarity(word: str) -> float:
 
 
 
 
199
  w = word.lower()
200
  if len(w) < 2:
201
  return 0.5
 
204
 
205
 
206
  def _ortho_neighborhood_size(word: str, aoa_dict: Dict[str, float]) -> int:
 
 
 
 
 
207
  w = word.lower()
208
  n = len(w)
209
  count = 0
 
223
  corpus_freq: Optional[Dict[str, int]] = None,
224
  corpus_total: int = 1,
225
  ) -> float:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  w = word.lower().strip()
227
  if not w or not w[0].isalpha():
228
  return 10.0
 
 
229
  if w in aoa:
230
  return aoa[w]
 
 
231
  if w in _EARLY_WORDS:
232
  return _EARLY_WORDS[w]
233
 
 
234
  n_chars = len(w)
235
  n_syl = _count_syllables(w)
236
  morph = _morpheme_complexity(w)
237
  bigram_f = _bigram_familiarity(w)
238
  neigh = _ortho_neighborhood_size(w, aoa)
239
 
 
240
  if corpus_freq and w in corpus_freq:
241
  rel_freq = corpus_freq[w] / max(corpus_total, 1)
242
+ log_freq = math.log(1 + rel_freq * 1_000_000)
243
  else:
244
  log_freq = 0.0
245
 
 
246
  intercept = 8.5
247
  Ξ²_len = 0.30
248
  Ξ²_syl = 0.55
 
270
  corpus_freq: Optional[Dict[str, int]] = None,
271
  corpus_total: int = 1,
272
  ) -> float:
 
273
  return calculate_word_age(token, aoa, corpus_freq, corpus_total)
274
 
275
 
276
def age_continuity_boost(age1: float, age2: float, strength: float = 0.12) -> float:
    """
    Small positive bias favouring pairs of similar — and early — ages.

    Decays exponentially with the age gap (scale 3 yr) and scales with
    how early the earlier of the two words is acquired (capped at 8 yr).
    """
    gap = abs(age1 - age2)
    earliness = min(age1, age2, 8.0) / 8.0
    return float(strength * math.exp(-gap / 3.0) * earliness)
280
 
281
 
282
  # ────────────────────────────────────────────────────────────────────────────
283
+ # COHOMOLOGY SCALARS β€” now length-dependent
284
  # ────────────────────────────────────────────────────────────────────────────
285
def topo_weight(token: str) -> float:
    """
    Length-dependent topology weight for a token.

    Each topology keyword found as a substring contributes 0.4 (capped at
    1.0); a small length-based presence term is added even without a
    keyword hit; the total is then scaled by the token's length-topology
    kernel, so short tokens (e.g. "co") receive far less topological
    weight than long ones (e.g. "cohomology").  Result is clamped to 1.0.
    """
    lowered = token.lower()
    keyword_score = min(1.0, sum(0.4 for kw in TOPO_KEYWORDS if kw in lowered))
    presence = 0.05 * length_alpha(token)
    return float(min(1.0, (keyword_score + presence) * length_topo_kernel(token)))
299
 
300
 
301
  def semantic_scalar(t1: str, t2: str) -> float:
 
324
 
325
 
326
  # ────────────────────────────────────────────────────────────────────────────
327
+ # LENGTH-DEPENDENT DOUBLE ENTENDRE EMBEDDER
328
  # ────────────────────────────────────────────────────────────────────────────
329
+ class LengthDependentEmbedder:
330
  """
331
+ Length-dependent double-entendre dot product.
332
 
333
+ For each (w1, w2, candidate) triple:
334
+ - DIM is determined by the CANDIDATE's length (the thing being scored)
335
+ - shift_mag and agreement_bonus scale with the ANCHOR word (w2) length
336
+ - A length-topology kernel modulates the final combined score
337
 
338
+ Two passes:
339
+ pass1 = dot(embed(w2, dim), embed(c, dim))
340
+ pass2 = dot(embed(w2, dim) + shift(w1, dim, mag), embed(c, dim))
341
 
342
+ combined = topo_kernel(c) * [0.5*(norm01(p1)+norm01(p2)) + bonus*min(p1,p2)]
343
+ + (1 - topo_kernel(c)) * 0.5*(norm01(p1)+norm01(p2))
344
+
345
+ This means topology modulation only kicks in for longer/more complex candidates.
346
+ """
347
 
348
def embed(self, token: str, dim: Optional[int] = None) -> np.ndarray:
    """
    Deterministic hash-based embedding of *token*.

    The SHA-256 digest of the token is repeated and truncated to ``dim``
    bytes (``dim`` defaults to the token's length-dependent dimension via
    length_dim), then L1-style normalised so the components sum to ~1.
    """
    d = dim if dim is not None else length_dim(token)
    raw_bytes = hashlib.sha256(token.encode("utf-8")).digest()  # 32 bytes
    # Repeat the digest enough times to cover any supported dim, then slice.
    repeated = (raw_bytes * ((d // 32) + 2))[:d]
    vec = np.array(list(repeated), dtype=np.float32)
    s = float(vec.sum())
    return vec / (s + 1e-8)
358
 
359
def shift_vector(self, token: str, dim: int, magnitude: float) -> np.ndarray:
    """
    Unit-norm MD5-hash direction for *token* in ``dim`` dimensions,
    scaled to *magnitude* (the caller pre-scales magnitude by length).
    """
    digest = hashlib.md5(token.encode("utf-8")).digest()  # 16 bytes
    filled = (digest * ((dim // 16) + 2))[:dim]
    direction = np.array(list(filled), dtype=np.float32)
    length = np.linalg.norm(direction)
    return (direction / (length + 1e-8)) * magnitude
366
 
367
  @staticmethod
368
  def _norm01(arr: np.ndarray) -> np.ndarray:
 
370
  mx = float(arr.max())
371
  return (arr - mn) / (mx - mn + 1e-12)
372
 
373
def length_dependent_weights(
    self,
    w1: str,
    w2: str,
    candidates: List[str],
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Length-dependent double-entendre weights for each candidate.

    Pass 1 is dot(embed(w2), embed(c)); pass 2 repeats it with w2's
    embedding shifted by a hash of w1.  Each candidate is scored in its
    own length-dependent dimension, the shift magnitude and agreement
    bonus derive from the anchor word w2's length, and a length-topology
    kernel gates how much the cross-frame agreement bonus applies
    (suppressed for short candidates, full strength for long ones).

    Returns (pass1_norm, pass2_norm, combined), each in [0, 1].
    """
    n = len(candidates)
    raw1 = np.zeros(n, dtype=np.float32)
    raw2 = np.zeros(n, dtype=np.float32)
    kernels = np.zeros(n, dtype=np.float32)

    # Anchor-level parameters depend on w2's length.
    shift_mag = length_shift_mag(w2)
    agree_bonus = length_agreement_bonus(w2)

    for idx, cand in enumerate(candidates):
        # Each candidate chooses its own embedding dimension.
        dim = length_dim(cand)
        anchor = self.embed(w2, dim=dim)
        cand_vec = self.embed(cand, dim=dim)
        # Shift the anchor by w1's hash direction in the same space,
        # then re-normalise in the same L1-ish style as embed().
        shifted = anchor + self.shift_vector(w1, dim=dim, magnitude=shift_mag)
        shifted = shifted / (abs(float(shifted.sum())) + 1e-8)
        raw1[idx] = float(np.dot(anchor, cand_vec))
        raw2[idx] = float(np.dot(shifted, cand_vec))
        kernels[idx] = length_topo_kernel(cand)

    p1 = self._norm01(raw1)
    p2 = self._norm01(raw2)

    # Agreement requires strength under BOTH frames; the topology kernel
    # suppresses the bonus for short candidates.
    agreement = float(agree_bonus) * np.minimum(p1, p2)
    combined = self._norm01(0.5 * (p1 + p2) + kernels * agreement)
    return p1, p2, combined
429
 
430
 
431
# Backwards-compatibility alias: pre-V8.6 code imports DoubleEntendreEmbedder;
# both names refer to the same class object.
DoubleEntendreEmbedder = LengthDependentEmbedder
433
+
434
+
435
  # ────────────────────────────────────────────────────────────────────────────
436
  # LANGUAGE MODEL
437
  # ────────────────────────────────────────────────────────────────────────────
438
  class NGramLM:
 
 
439
  def __init__(self, add_k: float = 1.5):
440
  self.add_k = float(add_k)
441
  self.uni: Dict[str, int] = {}
 
538
  @dataclass
539
  class CorpusState:
540
  lm: NGramLM
541
+ embedder: LengthDependentEmbedder
542
  aoa: Dict[str, float]
543
  token_boost: Dict[str, float] = field(default_factory=dict)
544
  corpus_freq: Dict[str, int] = field(default_factory=dict)
 
549
  tokens = tokenize(text)
550
  lm = NGramLM(add_k=1.5)
551
  lm.ingest(tokens)
552
+ embedder = LengthDependentEmbedder()
553
 
554
  total = max(1, sum(lm.uni.values()))
555
  token_boost: Dict[str, float] = {}
 
575
  w1: str,
576
  w2: str,
577
  temp: float = 1.2,
578
+ de_strength: float = 0.18,
 
 
 
579
  ema_prev: Optional[torch.Tensor] = None,
580
  ema_cands: Optional[List[str]] = None,
581
  ) -> Tuple[List[str], torch.Tensor]:
582
  cands, base_probs = state.lm.next_dist(w1, w2)
583
 
584
+ # Length-dependent double-entendre dot-product weights
585
+ _, _, de_combined = state.embedder.length_dependent_weights(
586
  w1=w1, w2=w2, candidates=cands,
 
 
587
  )
588
  de_t = torch.tensor(de_combined, dtype=torch.float32)
589
 
 
596
  cb_t = torch.tensor(cb, dtype=torch.float32)
597
  tb = torch.tensor([state.token_boost.get(c, 0.0) for c in cands], dtype=torch.float32)
598
 
 
599
  w2_age = word_age(state.aoa, w2, state.corpus_freq, state.corpus_total)
600
  age_arr = np.array(
601
  [age_continuity_boost(
 
606
  )
607
  age_t = torch.tensor(age_arr, dtype=torch.float32)
608
 
609
+ # Length-dependent topology also modulates the centroid boost
610
+ topo_kernels = torch.tensor(
611
+ [length_topo_kernel(c) for c in cands], dtype=torch.float32
612
+ )
613
+ topo_cb = cb_t * (0.5 + 0.5 * topo_kernels) # short words: 0.5x; long: 1x boost
614
+
615
+ boosts = float(de_strength) * de_t + topo_cb + 0.10 * tb + 0.15 * age_t
616
  logits = torch.log(base_probs.clamp_min(1e-12)) + boosts
617
  logits = logits / max(float(temp), 1e-6)
618
  probs = F.softmax(logits, dim=-1)
619
 
 
620
  if ema_prev is not None and ema_cands is not None:
621
  prev_idx = {w: i for i, w in enumerate(ema_cands)}
622
  aligned = torch.zeros_like(probs)
 
646
  w2 = sw[-1] if sw else "concept"
647
 
648
  voices = [
649
+ ("Positor", [
650
+ "what", "how", "when", "why", "where", "whether", "imagine", "suppose", "consider", "define",
651
+ "state", "pose", "query", "assert", "envision", "propose", "determine", "specify", "outline", "identify",
652
+ "explore", "focus", "express", "declare", "suggest"
653
+ ]),
654
+ ("Analyzer", [
655
+ "because", "therefore", "thus", "hence", "examine", "observe", "inspect", "compare", "contrast", "deduce",
656
+ "infer", "evaluate", "scrutinize", "measure", "determine", "diagnose", "trace", "test", "quantify", "assess",
657
+ "prove", "analyze", "dissect", "uncover", "establish"
658
+ ]),
659
+ ("Synthesizer", [
660
+ "thus", "between", "integrates", "suggests", "combines", "merges", "connects", "unifies", "fuses", "blends",
661
+ "resolves", "harmonizes", "links", "joins", "bridges", "reconciles", "aligns", "connects", "coalesces", "balances",
662
+ "melds", "incorporates", "relates", "summarizes", "converges"
663
+ ]),
664
+ ("Reflector", [
665
+ "ultimately", "reveals", "illuminates", "perhaps", "maybe", "indicates", "implies", "evokes", "signifies", "suggests",
666
+ "contemplates", "meditates", "distills", "uncovers", "concludes", "infers", "recognizes", "appreciates", "ponders", "rethinks",
667
+ "interprets", "acknowledges", "realizes", "wonders", "discerns"
668
+ ]),
669
+ ("Connector", [
670
+ "relates", "links", "bridges", "connects", "associates", "correlates", "binds", "ties", "concatenates", "couples",
671
+ "unites", "joins", "interweaves", "crosses", "maps", "compares", "contextualizes", "interrelates", "interlaces", "binds",
672
+ "matches", "aggregates", "corresponds", "equates", "aligns"
673
+ ]),
674
+ ("Elaborator", [
675
+ "further", "moreover", "extends", "develops", "expands", "deepens", "broadens", "amplifies", "details", "illustrates",
676
+ "enhances", "supports", "enriches", "reiterates", "strengthens", "continues", "adds", "accentuates", "clarifies", "builds",
677
+ "reinforces", "emphasizes", "substantiates", "heightens", "extends"
678
+ ]),
679
+ ][: max(1, int(num_voices))]
680
 
681
  result: List[Tuple[str, List[str]]] = []
682
  current_voice = 0
 
761
 
762
 
763
  # ────────────────────────────────────────────────────────────────────────────
764
+ # AGE + LENGTH ANALYSIS HELPER
765
  # ────────────────────────────────────────────────────────────────────────────
766
+ def age_and_length_analysis(
767
  state: CorpusState,
768
  top_n: int = 10,
769
  ) -> str:
 
 
 
 
770
  alpha_vocab = [t for t in state.lm.vocab if t.isalpha() and t not in STOP_WORDS]
771
  if not alpha_vocab:
772
  return "No alpha vocabulary found."
 
776
  for t in alpha_vocab
777
  }
778
  sorted_ages = sorted(ages.items(), key=lambda x: x[1])
 
779
  youngest = sorted_ages[:top_n]
780
  oldest = sorted_ages[-top_n:][::-1]
781
 
 
786
  sum((v - mean_age) ** 2 for v in ages.values()) / max(1, len(ages))
787
  )
788
 
789
+ # Length-dependent topology analysis
790
+ topo_by_len: Dict[int, List[Tuple[str, float]]] = {}
791
+ for t in alpha_vocab:
792
+ d = length_dim(t)
793
+ tw = topo_weight(t)
794
+ Ξ± = length_alpha(t)
795
+ kern = length_topo_kernel(t)
796
+ if d not in topo_by_len:
797
+ topo_by_len[d] = []
798
+ topo_by_len[d].append((t, tw * kern))
799
+
800
+ dim_summary_lines = []
801
+ for d in sorted(topo_by_len.keys()):
802
+ entries = topo_by_len[d]
803
+ avg_tw = sum(v for _, v in entries) / max(1, len(entries))
804
+ top_ex = sorted(entries, key=lambda x: -x[1])[:3]
805
+ ex_str = ", ".join(f"{w}({v:.2f})" for w, v in top_ex)
806
+ dim_summary_lines.append(
807
+ f" DIM={d:2d} | {len(entries):4d} words | mean topo×kernel={avg_tw:.3f} | top: {ex_str}"
808
+ )
809
+
810
  lines = [
811
  f"Alpha vocab: {len(alpha_vocab)} words",
812
  f" Normed (Kuperman): {normed}",
 
818
  "",
819
  f"Oldest {top_n} (latest acquired):",
820
  " " + ", ".join(f"{w}({a:.1f})" for w, a in oldest),
821
+ "",
822
+ "── Length-Dependent Topology Dot-Product Summary ──",
823
+ f" DIM range: {DIM_MIN}–{DIM_MAX} | length ceil: {LENGTH_CEIL}",
824
+ f" shift_mag range: {SHIFT_MAG_MIN:.2f}–{SHIFT_MAG_MAX:.2f}",
825
+ f" agreement_bonus range: {AGREEMENT_BONUS_MIN:.2f}–{AGREEMENT_BONUS_MAX:.2f}",
826
+ "",
827
+ ] + dim_summary_lines
828
+
829
  return "\n".join(lines)
830
 
831
 
 
834
  # ────────────────────────────────────────────────────────────────────────────
835
  def run_session(
836
  use_hf, hf_dataset, hf_split, hf_max_rows,
837
+ text_file, prompt, seed, max_tokens, num_voices, temp, tokens_per_turn,
838
  progress=gr.Progress(),
839
  ):
840
  try:
 
847
  progress(0.40, desc="Building language model…")
848
  state = build_state(text, aoa)
849
 
850
+ progress(0.60, desc="Analysing word ages + length topology…")
851
+ age_stats = age_and_length_analysis(state)
852
 
853
  progress(0.70, desc="Generating narrative…")
854
  out_md = generate(
 
857
  seed=int(seed),
858
  num_voices=int(num_voices),
859
  temp=float(temp),
860
+ tokens_per_turn=int(tokens_per_turn),
861
  )
862
 
863
  vocab_size = len(state.lm.vocab)
864
+ topo_hits = [t for t in state.lm.vocab if topo_weight(t) > 0.05]
865
  normed = sum(1 for t in state.lm.vocab if t.isalpha() and t in aoa)
866
  alpha_total = sum(1 for t in state.lm.vocab if t.isalpha())
867
 
868
+ # Sample length-dim distribution
869
+ alpha_vocab = [t for t in state.lm.vocab if t.isalpha()]
870
+ dim_counts: Dict[int, int] = {}
871
+ for t in alpha_vocab:
872
+ d = length_dim(t)
873
+ dim_counts[d] = dim_counts.get(d, 0) + 1
874
+ dim_dist = " " + " ".join(f"DIM{d}:{n}" for d, n in sorted(dim_counts.items()))
875
+
876
  stats = "\n".join([
877
  f"Vocab size: {vocab_size}",
878
  f"AoA normed (Kuperman exact): {normed}/{alpha_total}",
879
  f"AoA calculated (feature model): {alpha_total - normed}/{alpha_total}",
880
+ f"Topo tokens (length-weighted): {len(topo_hits)}",
881
+ f"Temperature: {float(temp):.2f} | add_k: {state.lm.add_k:.2f}",
 
882
  f"Generated tokens: {int(max_tokens)}",
883
  "",
884
+ "── Length→DIM distribution ──",
885
+ dim_dist,
886
+ "",
887
+ "── Word-Age + Length-Topology Analysis ──",
888
  age_stats,
889
  ])
890
  return out_md, stats
 
906
 
907
  def build_app():
908
  with gr.Blocks(
909
+ title="NeuroSymbolic V8.6 — Length-Dependent Topology Dot Products",
910
  theme=gr.themes.Soft(),
911
  ) as demo:
912
  gr.Markdown(
913
+ "# NeuroSymbolic V8.6 — Length-Dependent Topology Dot Products\n"
914
+ "The topology dot-product now **scales with word/token length**.\n\n"
915
+ "| Parameter | Short words | Long words |\n"
916
+ "|-----------|------------|------------|\n"
917
+ "| Embedding DIM | 2–4 | 8–12 |\n"
918
+ "| Shift magnitude | 0.05 | 0.35 |\n"
919
+ "| Agreement bonus | 0.10 | 0.60 |\n"
920
+ "| Topo kernel gate | ~0.05 | ~1.0 |\n\n"
921
+ "**Effect:** Short words (cat, big) have compact, lightly modulated dot products. "
922
+ "Long words (cohomology, reconstruction) use high-dimensional embeddings with strong "
923
+ "topological agreement gating and large frame-shift vectors."
924
  )
925
 
926
  with gr.Row():
 
936
  max_tokens = gr.Slider(100, 800, value=300, step=50, label="Max Tokens")
937
  num_voices = gr.Slider(2, 6, value=3, step=1, label="Narrative Voices")
938
  temp = gr.Slider(0.8, 2.5, value=1.4, step=0.1, label="Temperature")
939
+ tokens_per_turn = gr.Slider(20, 200, value=170, step=10, label="Tokens per Role")
940
 
941
  with gr.Column(scale=2):
942
  prompt = gr.Textbox(
 
947
  btn = gr.Button("Generate", variant="primary", size="lg")
948
  gr.Markdown("## Generated Narrative (roles)")
949
  output_md = gr.Markdown(value="")
950
+ output_stats = gr.Textbox(label="Stats + Length-Topology Analysis", lines=25)
951
 
952
  btn.click(
953
  run_session,
954
  inputs=[use_hf, hf_dataset, hf_split, hf_max_rows,
955
+ text_file, prompt, seed, max_tokens, num_voices, temp, tokens_per_turn],
956
  outputs=[output_md, output_stats],
957
  )
958
 
959
  gr.Markdown(
960
+ "### Design Notes\n"
961
+ "- `length_alpha(word)` → smooth sigmoid in [0,1] centered at half of `LENGTH_CEIL`\n"
962
+ "- `length_dim(word)` → embedding dimension 2–12 (always even, rounded)\n"
963
+ "- `length_topo_kernel(word)` → gates agreement bonus: short=0.05, long≈1.0\n"
964
+ "- `topo_weight(word)` → keyword hit × length_topo_kernel (length-amplified)\n"
965
+ "- `centroid_boost` modulated by topo_kernel: short words get 0.5× boost\n"
966
+ "- Install: `pip install gradio datasets torch pandas numpy`"
967
  )
968
  return demo
969
 
970
 
971
  if __name__ == "__main__":
972
+ build_app().queue().launch(share=False)