Spaces:

basilboy
/

SelfOrganisingText

Sleeping

App Files Files Community

basilboy committed on Sep 8, 2025

Commit

cd1fae2

verified ·

1 Parent(s): b9ca465

Update app.py

Browse files

Files changed (1) hide show

app.py +130 -81

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from transformers import AutoTokenizer
 from safetensors.torch import load_file as load_sft
 from huggingface_hub import snapshot_download
-torch.set_default_dtype(torch.float32)
 # ===============================================
 # Default config (from your training notes)
@@ -62,9 +62,10 @@ class AttnBlock(nn.Module):
         return Qh2, Kh2
     def forward(self, x, rope, radius):
         if x.dtype != self.norm1.weight.dtype:
             x = x.to(self.norm1.weight.dtype)
         h = self.norm1(x)
         B, S, E = h.shape
         cos, sin = rope
@@ -130,6 +131,11 @@ class CNA(nn.Module):
             h = self.tok_emb(x)
         else:
             h = x
         B, S, E = h.shape
         hd = self.embed_dim // self.num_heads
         cos, sin = self._rope_seq(S, hd, h.device, h.dtype)
@@ -139,8 +145,7 @@ class CNA(nn.Module):
 # ===============================================
 # Helpers
-#
 def to_batch2(ids_like) -> torch.Tensor:
     """
     Normalize ids_like (list, [[...]], tensor) to int64 shape [1, S].
@@ -155,7 +160,6 @@ def to_batch2(ids_like) -> torch.Tensor:
         x = x.view(1, -1)               # fallback reshape
     return x
 def infer_expansion_factor_from_state(state, embed_dim):
     for key in ("blocks.0.mlp.0.weight", "blocks.0.mlp.2.weight"):
         if key in state:
@@ -250,36 +254,24 @@ def sample_from_logits(logits_row, temperature=1.0, current_token=None, exclude_
 # ===============================================
 # Weight loading (file / folder / HF Hub)
-# Handles weights-only .pt (state_dict) as well.
 # ===============================================
 DEFAULT_CKPT = os.environ.get("CKPT_PATH", "ckpt_latest.pt")
 DEFAULT_WEIGHTS_DIR = os.environ.get("WEIGHTS_DIR", "weights_latest")
 def _read_config_from_dict_or_infer(state, cfg):
-    # start from provided cfg merged over defaults
     merged = {**DEFAULT_CONF, **(cfg or {})}
-    # infer from weights if available
     if "tok_emb.weight" in state:
         merged["embed_dim"] = state["tok_emb.weight"].shape[1]
-    # infer num_blocks by scanning keys
     block_idxs = [int(m.group(1)) for k in state.keys() for m in [re.match(r"blocks\.(\d+)\.", k)] if m]
     if block_idxs:
         merged["num_blocks"] = max(block_idxs) + 1
-    # num_heads, radius, expansion_factor often aren't inferable; keep merged defaults
-    # expansion_factor can be inferred from MLP shapes if present
     if "blocks.0.mlp.0.weight" in state or "blocks.0.mlp.2.weight" in state:
         merged["expansion_factor"] = infer_expansion_factor_from_state(state, merged["embed_dim"])
-    # tokenizer
     if not merged.get("tokenizer_name"):
         merged["tokenizer_name"] = "gpt2"
     return merged
 def _is_state_dict(obj):
-    # A reasonable heuristic: a dict whose values are Tensors (and keys look like module names)
     if isinstance(obj, dict) and obj:
         sample_val = next(iter(obj.values()))
         return isinstance(sample_val, torch.Tensor)
@@ -287,14 +279,12 @@ def _is_state_dict(obj):
 def _load_state_from_pt(path: str):
     obj = torch.load(path, map_location="cpu")
-    # Case A: legacy payload with {"model": state_dict, "config": {...}}
     if isinstance(obj, dict) and "model" in obj and isinstance(obj["model"], dict):
         state = obj["model"]
         cfg = obj.get("config", {}) or {}
         if "tokenizer_name" in obj:
             cfg = {**cfg, "tokenizer_name": obj["tokenizer_name"]}
         return state, cfg
-    # Case B: weights-only state_dict (your case)
     if _is_state_dict(obj):
         return obj, {}
     raise ValueError(f"Unsupported .pt format at {path}: expected a state_dict or a payload with 'model'.")
@@ -402,8 +392,8 @@ def load_model(source: str):
             nn.init.zeros_(model.proj.bias)
     else:
         model.load_state_dict(state, strict=True)
-    # ✅ hard-cast ALL params & buffers to float32 (handles weights-only .pt that saved as float64)
     model = model.to(torch.float32)
     with torch.no_grad():
         for p in model.parameters():
@@ -412,7 +402,7 @@ def load_model(source: str):
         for _, buf in model.named_buffers():
             if buf.dtype.is_floating_point:
                 buf.data = buf.data.float()
     model.eval()
     return model, tokenizer, conf["radius"]
@@ -427,11 +417,10 @@ def _auto_default_source():
     for name in ["weights_latest.pt", "ckpt_latest.pt"]:
         if os.path.isfile(name):
             return name
-    # first .pt or .safetensors in repo root
     for f in sorted(os.listdir(".")):
         if f.endswith(".pt") or f.endswith(".safetensors"):
             return f
-    return "weights_latest.pt"  # sane default for your case
 def ensure_model(source_path_or_repo):
     src = source_path_or_repo or _auto_default_source()
@@ -467,33 +456,73 @@ def init_random(src, seqlen, seed):
     txt = decode(x[0], model_cache["tokenizer"])
     return x.tolist(), txt, f"Initialized random sequence (len={int(seqlen)})"
-def init_from_text(src, seqlen, text, seed, pad_mode):
-    ensure_model(src)
-    rnd = random.Random(seed)
-    x = to_fixed_len_ids(text or "", model_cache["tokenizer"], int(seqlen), pad_mode=pad_mode, rnd=rnd)
-    txt = decode(x[0], model_cache["tokenizer"])
-    return x.tolist(), txt, "Initialized from text"
-def append_text(src, state_ids, seqlen, text_to_append, seed):
-    ensure_model(src)
     tok = model_cache["tokenizer"]
-    rnd = random.Random(seed)
     S = int(seqlen)
-    if state_ids is None or len(state_ids) == 0:
-        x = to_fixed_len_ids(text_to_append or "", tok, S, pad_mode="random", rnd=rnd)
-    else:
-        x = to_batch2(state_ids)  # <-- normalize
-        extra = tok.encode(text_to_append or "", add_special_tokens=False)
-        x = torch.cat([x, torch.tensor(extra, dtype=torch.long).unsqueeze(0)], dim=1)
-        if x.shape[1] > S:
-            x = x[:, :S]
-        elif x.shape[1] < S:
-            need = S - x.shape[1]
-            V = tok.vocab_size
-            pad = torch.tensor([rnd.randrange(V) for _ in range(need)], dtype=torch.long).unsqueeze(0)
-            x = torch.cat([x, pad], dim=1)
-    txt = decode(x[0], tok)
-    return x.tolist(), txt, "Appended text and resized to target length"
 def apply_noise(src, state_ids, seqlen, indices_csv, add_left, add_right, seed):
     ensure_model(src)
@@ -503,29 +532,28 @@ def apply_noise(src, state_ids, seqlen, indices_csv, add_left, add_right, seed):
         V = tok.vocab_size
         base = torch.randint(0, V, (1, S))
     else:
-        base = to_batch2(state_ids)  # <-- normalize
     x = apply_noise_ops(base, tok, indices_csv, int(add_left or 0), int(add_right or 0), S, seed=seed)
     txt = decode(x[0], tok)
-    return x.tolist(), txt, "Applied noise brush / prepend / append"
 def step_once(src, state_ids, mode, temperature, exclude_current):
     ensure_model(src)
     tok = model_cache["tokenizer"]
     if state_ids is None or len(state_ids) == 0:
         return None, "", "No sequence to step — initialize first."
-    x = to_batch2(state_ids)  # <-- instead of torch.tensor(...).unsqueeze(0)
     x = step_strategy1(model_cache["model"], x, mode=mode, temperature=temperature, exclude_current=exclude_current)
     txt = decode(x[0], tok)
     return x.tolist(), txt, f"Stepped 1 iteration ({mode})"
 def live_denoise(src, state_ids, steps, snap_every, seed, mode, temperature, exclude_current):
     ensure_model(src)
     tok = model_cache["tokenizer"]
     if state_ids is None or len(state_ids) == 0:
         return
     random.seed(seed); torch.manual_seed(seed)
-    x = to_batch2(state_ids)  # <-- normalize
     total = int(steps); snap = max(1, int(snap_every))
     for t in range(1, total + 1):
         x = step_strategy1(model_cache["model"], x, mode=mode, temperature=temperature, exclude_current=exclude_current)
@@ -534,19 +562,22 @@ def live_denoise(src, state_ids, steps, snap_every, seed, mode, temperature, exc
             yield x.tolist(), txt, f"Live denoise… step {t}/{total} ({mode})"
 # ===============================================
-# UI
 # ===============================================
 with gr.Blocks(title="CNA — Interactive Denoising") as demo:
     gr.Markdown(
         """
         # CNA — Interactive Denoising (Strategy 1)
-        - **Weights source** can be: a `.pt` **weights-only state_dict** (e.g., `weights_latest.pt`), a folder of shards, or a **Hub repo id**.
-        - Update rule per step: **argmax** or **sample** (temperature + option to exclude current token).
-        - Tools: Random init, Init from text, Noise brush (select indices, prepend/append noise), Append text, Live denoise.
         """
     )
-    default_source = _auto_default_source()
     with gr.Row():
         src = gr.Textbox(value=default_source, label="Weights (file / folder / HF repo id)")
         seqlen = gr.Slider(10, 512, value=100, step=1, label="Sequence length (S)")
@@ -555,10 +586,10 @@ with gr.Blocks(title="CNA — Interactive Denoising") as demo:
     ids_state = gr.State(value=None)
     with gr.Row():
-        current_text = gr.Textbox(lines=8, label="Current text", interactive=False)
     status = gr.Markdown("Ready.")
-    gr.Markdown("## Mode 1 · Random → Denoise Live")
     with gr.Row():
         btn_random = gr.Button("Initialize Random")
         steps = gr.Slider(1, 2000, value=200, step=1, label="Denoise steps (N)")
@@ -571,32 +602,50 @@ with gr.Blocks(title="CNA — Interactive Denoising") as demo:
         btn_step_once = gr.Button("Step Once")
         btn_live = gr.Button("Denoise Live (streaming)")
-    gr.Markdown("## Mode 2 · Initialize From Your Text")
     with gr.Row():
-        init_text = gr.Textbox(lines=4, label="Initial text")
-    with gr.Row():
-        pad_mode = gr.Radio(choices=["random", "eos"], value="random", label="Pad mode (if text shorter than S)")
-        btn_init_text = gr.Button("Initialize From Text")
-    gr.Markdown("## Noise Brush · Select Positions + Prepend/Append Noise")
-    with gr.Row():
-        indices_csv = gr.Textbox(label="Positions to noise (e.g., 0, 5, 10-20)", placeholder="Leave empty to skip")
     with gr.Row():
         add_left = gr.Number(value=0, precision=0, label="Noise tokens to add at START")
         add_right = gr.Number(value=0, precision=0, label="Noise tokens to add at END")
-        btn_apply_noise = gr.Button("Apply Noise Brush / Prepend / Append")
-    gr.Markdown("## Append Text")
-    with gr.Row():
-        append_box = gr.Textbox(lines=3, label="Text to append")
-        btn_append = gr.Button("Append to Current Sequence")
-    # Wiring
     btn_random.click(init_random, [src, seqlen, seed], [ids_state, current_text, status])
-    btn_init_text.click(init_from_text, [src, seqlen, init_text, seed, pad_mode], [ids_state, current_text, status])
-    btn_apply_noise.click(apply_noise, [src, ids_state, seqlen, indices_csv, add_left, add_right, seed], [ids_state, current_text, status])
-    btn_append.click(append_text, [src, ids_state, seqlen, append_box, seed], [ids_state, current_text, status])
-    btn_step_once.click(step_once, [src, ids_state, update_mode, temperature, exclude_current], [ids_state, current_text, status])
-    btn_live.click(live_denoise, [src, ids_state, steps, snap_every, seed, update_mode, temperature, exclude_current], [ids_state, current_text, status], show_progress=True)
 demo.queue().launch()

 from safetensors.torch import load_file as load_sft
 from huggingface_hub import snapshot_download
+torch.set_default_dtype(torch.float32)
 # ===============================================
 # Default config (from your training notes)
         return Qh2, Kh2
     def forward(self, x, rope, radius):
+        # keep LN inputs & params same dtype
         if x.dtype != self.norm1.weight.dtype:
             x = x.to(self.norm1.weight.dtype)
         h = self.norm1(x)
         B, S, E = h.shape
         cos, sin = rope
             h = self.tok_emb(x)
         else:
             h = x
+        # ensure embeddings/activations dtype follows model dtype
+        target_dtype = next(self.parameters()).dtype
+        if h.dtype != target_dtype:
+            h = h.to(target_dtype)
         B, S, E = h.shape
         hd = self.embed_dim // self.num_heads
         cos, sin = self._rope_seq(S, hd, h.device, h.dtype)
 # ===============================================
 # Helpers
+# ===============================================
 def to_batch2(ids_like) -> torch.Tensor:
     """
     Normalize ids_like (list, [[...]], tensor) to int64 shape [1, S].
         x = x.view(1, -1)               # fallback reshape
     return x
 def infer_expansion_factor_from_state(state, embed_dim):
     for key in ("blocks.0.mlp.0.weight", "blocks.0.mlp.2.weight"):
         if key in state:
 # ===============================================
 # Weight loading (file / folder / HF Hub)
 # ===============================================
 DEFAULT_CKPT = os.environ.get("CKPT_PATH", "ckpt_latest.pt")
 DEFAULT_WEIGHTS_DIR = os.environ.get("WEIGHTS_DIR", "weights_latest")
 def _read_config_from_dict_or_infer(state, cfg):
     """
     Build the model config: defaults, overlaid by ``cfg``, then corrected from the weights.

     ``state`` is only consulted for key names and tensor shapes. Values inferred
     from it overwrite whatever ``DEFAULT_CONF`` or ``cfg`` supplied; settings that
     are not inferred here keep their merged value. Returns the merged dict.
     """
     # Entries in cfg take precedence over the defaults; cfg may be None or empty.
     merged = {**DEFAULT_CONF, **(cfg or {})}
     # embed_dim is read from the second dimension of the token embedding weight.
     if "tok_emb.weight" in state:
         merged["embed_dim"] = state["tok_emb.weight"].shape[1]
     # Collect N from every key of the form "blocks.N."; the highest index + 1 is the block count.
     block_idxs = [int(m.group(1)) for k in state.keys() for m in [re.match(r"blocks\.(\d+)\.", k)] if m]
     if block_idxs:
         merged["num_blocks"] = max(block_idxs) + 1
     # expansion_factor is delegated to the helper whenever a first-block MLP weight is present.
     if "blocks.0.mlp.0.weight" in state or "blocks.0.mlp.2.weight" in state:
         merged["expansion_factor"] = infer_expansion_factor_from_state(state, merged["embed_dim"])
     # Fall back to the "gpt2" tokenizer when none is configured (missing or empty value).
     if not merged.get("tokenizer_name"):
         merged["tokenizer_name"] = "gpt2"
     return merged
 def _is_state_dict(obj):
     if isinstance(obj, dict) and obj:
         sample_val = next(iter(obj.values()))
         return isinstance(sample_val, torch.Tensor)
 def _load_state_from_pt(path: str):
     """
     Load a ``.pt`` file and return ``(state_dict, config_dict)``.

     Two layouts are accepted:
       * a payload dict holding the state_dict under ``"model"``, with optional
         ``"config"`` and ``"tokenizer_name"`` entries (the tokenizer name is folded
         into the returned config);
       * anything ``_is_state_dict`` accepts, returned with an empty config.
     Raises ValueError for any other content.
     """
     # NOTE(review): torch.load unpickles the file, so only trusted checkpoints should
     # be loaded this way; consider weights_only=True if both layouts still load — confirm.
     obj = torch.load(path, map_location="cpu")
     # Layout 1: payload with the weights under "model".
     if isinstance(obj, dict) and "model" in obj and isinstance(obj["model"], dict):
         state = obj["model"]
         # A missing or falsy "config" entry becomes an empty dict.
         cfg = obj.get("config", {}) or {}
         # A top-level tokenizer name overrides any tokenizer_name inside the config.
         if "tokenizer_name" in obj:
             cfg = {**cfg, "tokenizer_name": obj["tokenizer_name"]}
         return state, cfg
     # Layout 2: weights-only state_dict.
     if _is_state_dict(obj):
         return obj, {}
     raise ValueError(f"Unsupported .pt format at {path}: expected a state_dict or a payload with 'model'.")
             nn.init.zeros_(model.proj.bias)
     else:
         model.load_state_dict(state, strict=True)
+    # enforce float32 across params & buffers
     model = model.to(torch.float32)
     with torch.no_grad():
         for p in model.parameters():
         for _, buf in model.named_buffers():
             if buf.dtype.is_floating_point:
                 buf.data = buf.data.float()
     model.eval()
     return model, tokenizer, conf["radius"]
     for name in ["weights_latest.pt", "ckpt_latest.pt"]:
         if os.path.isfile(name):
             return name
     for f in sorted(os.listdir(".")):
         if f.endswith(".pt") or f.endswith(".safetensors"):
             return f
+    return "weights_latest.pt"
 def ensure_model(source_path_or_repo):
     src = source_path_or_repo or _auto_default_source()
     txt = decode(x[0], model_cache["tokenizer"])
     return x.tolist(), txt, f"Initialized random sequence (len={int(seqlen)})"
def to_ranges(indices):
    """
    Compress token indices into a CSV of single values and inclusive 'a-b' ranges.

    `indices` may be any iterable of ints (or None). It does not have to be sorted
    or de-duplicated: the values are de-duplicated and sorted here before runs of
    consecutive integers are collapsed.

    Returns the CSV string, e.g. [3, 1, 2, 7] -> "1-3, 7", or "" when there is
    nothing to compress.
    """
    # Materialise first so that emptiness is tested on the actual values; testing the
    # raw argument would misjudge iterators and other objects without list truthiness.
    ordered = sorted(set(indices)) if indices is not None else []
    if not ordered:
        return ""

    def _fmt(lo, hi):
        # A run of length one is written as a bare number.
        return f"{lo}-{hi}" if lo != hi else f"{lo}"

    parts = []
    start = prev = ordered[0]
    for idx in ordered[1:]:
        if idx == prev + 1:
            # Still inside the current consecutive run.
            prev = idx
            continue
        parts.append(_fmt(start, prev))
        start = prev = idx
    parts.append(_fmt(start, prev))
    return ", ".join(parts)
def capture_selection(text, seqlen, current_ids, evt: gr.SelectData | None = None):
    """
    Translate a highlighted character span of `text` into token index ranges.

    The result is meant to fill the indices textbox so the selection can be noised.

    Args:
        text: Contents of the text box the selection was made in.
        seqlen: Sequence length S; token indices >= S are discarded.
        current_ids: Current id state. Accepted because the event wiring passes it,
            but it is not consulted here.
        evt: Selection event; `.index` is read as (start_char, end_char).

    Returns:
        `(indices_csv, status)`. When nothing usable was selected the first item is
        `gr.update()` (leave the indices box as it is) and the status explains why.
    """
    # No weights source reaches this handler, so ensure_model falls back to its default.
    # NOTE(review): the other handlers pass the UI's weights source — confirm both
    # paths end up with the same tokenizer.
    ensure_model(None)
    tok = model_cache["tokenizer"]
    if not text:
        return gr.update(), "No text to select from."

    # NOTE(review): Gradio's documented form for event data is a plain
    # `evt: gr.SelectData` hint; confirm the optional hint used here is still
    # injected, otherwise `evt` stays None and every call takes the branch below.
    try:
        start, end = (int(pos) for pos in getattr(evt, "index", None))
    except (TypeError, ValueError):
        # Missing event, an index that is not a pair, or non-numeric positions.
        start = end = None
    if start is None or start == end:
        return gr.update(), "No selection detected (drag to highlight)."

    # Defensive: normalise a reversed span, then clamp both ends to the text bounds.
    if start > end:
        start, end = end, start
    start = max(0, min(len(text), start))
    end = max(0, min(len(text), end))

    # Per-token character offsets are only offered by fast tokenizers; report the
    # limitation in the status line instead of failing the UI callback.
    try:
        enc = tok(text, add_special_tokens=False, return_offsets_mapping=True)
    except NotImplementedError:
        return gr.update(), "Tokenizer cannot report character offsets (a fast tokenizer is required)."
    offsets = enc["offset_mapping"]  # one (start_char, end_char) pair per token

    # A token is hit when its character span intersects [start, end).
    # NOTE(review): the indices come from re-tokenizing `text`; presumably they line
    # up with the id state only if decoding and re-encoding round-trips — confirm.
    hits = [
        i
        for i, (s, e) in enumerate(offsets)
        if s is not None and e is not None and max(s, start) < min(e, end)
    ]
    if not hits:
        return gr.update(), "Selection didn't hit any tokens (maybe whitespace)."

    # Drop indices past the sequence length so later indexing stays within S.
    S = int(seqlen)
    token_idxs = [i for i in hits if i < S]
    if not token_idxs:
        return gr.update(), "Selected span maps beyond current sequence length."

    indices_csv = to_ranges(token_idxs)
    return indices_csv, f"Selected chars [{start}:{end}) → tokens {indices_csv}"
 def apply_noise(src, state_ids, seqlen, indices_csv, add_left, add_right, seed):
     ensure_model(src)
         V = tok.vocab_size
         base = torch.randint(0, V, (1, S))
     else:
+        base = to_batch2(state_ids)
     x = apply_noise_ops(base, tok, indices_csv, int(add_left or 0), int(add_right or 0), S, seed=seed)
     txt = decode(x[0], tok)
+    return x.tolist(), txt, "Applied noise"
 def step_once(src, state_ids, mode, temperature, exclude_current):
     """
     Advance the current sequence by a single `step_strategy1` update.

     Returns ``(ids, text, status)``. When no sequence has been initialised yet,
     ids is None, text is empty and the status asks the user to initialise first.
     """
     ensure_model(src)
     tok = model_cache["tokenizer"]
     # Nothing to step: report it through the status message instead of raising.
     if state_ids is None or len(state_ids) == 0:
         return None, "", "No sequence to step — initialize first."
     # The stored ids are normalised through to_batch2 before being fed to the model.
+    x = to_batch2(state_ids)
     x = step_strategy1(model_cache["model"], x, mode=mode, temperature=temperature, exclude_current=exclude_current)
     # Only the first row is decoded for display.
     txt = decode(x[0], tok)
     return x.tolist(), txt, f"Stepped 1 iteration ({mode})"
 def live_denoise(src, state_ids, steps, snap_every, seed, mode, temperature, exclude_current):
     ensure_model(src)
     tok = model_cache["tokenizer"]
     if state_ids is None or len(state_ids) == 0:
         return
     random.seed(seed); torch.manual_seed(seed)
+    x = to_batch2(state_ids)
     total = int(steps); snap = max(1, int(snap_every))
     for t in range(1, total + 1):
         x = step_strategy1(model_cache["model"], x, mode=mode, temperature=temperature, exclude_current=exclude_current)
             yield x.tolist(), txt, f"Live denoise… step {t}/{total} ({mode})"
 # ===============================================
+# UI (single mode)
 # ===============================================
 with gr.Blocks(title="CNA — Interactive Denoising") as demo:
     gr.Markdown(
         """
         # CNA — Interactive Denoising (Strategy 1)
+        - **Weights source**: `.pt` weights-only (e.g., `weights_latest.pt`), a folder of shards, or a **Hub repo id**.
+        - Update rule per step: **argmax** or **sample** (temperature + exclude current).
+        - Tools: Random init, **drag to select** in the text box → *Noise Selection*, manual indices, prepend/append noise, live denoise.
         """
     )
+    default_source = os.environ.get("WEIGHTS_SOURCE", None)
+    if default_source is None:
+        default_source = _auto_default_source()
     with gr.Row():
         src = gr.Textbox(value=default_source, label="Weights (file / folder / HF repo id)")
         seqlen = gr.Slider(10, 512, value=100, step=1, label="Sequence length (S)")
     ids_state = gr.State(value=None)
     with gr.Row():
+        current_text = gr.Textbox(lines=8, label="Current text", interactive=True)
     status = gr.Markdown("Ready.")
+    gr.Markdown("### Initialize & Denoise")
     with gr.Row():
         btn_random = gr.Button("Initialize Random")
         steps = gr.Slider(1, 2000, value=200, step=1, label="Denoise steps (N)")
         btn_step_once = gr.Button("Step Once")
         btn_live = gr.Button("Denoise Live (streaming)")
+    gr.Markdown("### Noise Selection or Manual Indices")
     with gr.Row():
+        indices_csv = gr.Textbox(label="Positions to noise (auto-filled from selection, or enter like `0, 5, 10-20`)")
     with gr.Row():
         add_left = gr.Number(value=0, precision=0, label="Noise tokens to add at START")
         add_right = gr.Number(value=0, precision=0, label="Noise tokens to add at END")
+        btn_noise_selection = gr.Button("Noise Selection")
+        btn_apply_noise = gr.Button("Apply Noise (from indices)")
+    # --- Wiring ---
     btn_random.click(init_random, [src, seqlen, seed], [ids_state, current_text, status])
+    # Select in text → auto-compute token indices into indices_csv
+    current_text.select(
+        capture_selection,
+        [current_text, seqlen, ids_state],
+        [indices_csv, status]
+    )
+    # “Noise Selection” just applies whatever is in indices_csv
+    btn_noise_selection.click(
+        apply_noise,
+        [src, ids_state, seqlen, indices_csv, 0, 0, seed],
+        [ids_state, current_text, status]
+    )
+    # Manual indices + prepend/append noise
+    btn_apply_noise.click(
+        apply_noise,
+        [src, ids_state, seqlen, indices_csv, add_left, add_right, seed],
+        [ids_state, current_text, status]
+    )
+    btn_step_once.click(
+        step_once,
+        [src, ids_state, update_mode, temperature, exclude_current],
+        [ids_state, current_text, status]
+    )
+    btn_live.click(
+        live_denoise,
+        [src, ids_state, steps, snap_every, seed, update_mode, temperature, exclude_current],
+        [ids_state, current_text, status],
+        show_progress=True
+    )
 demo.queue().launch()