Gavin-Wang committed on
Commit b1b2e62 · verified · 1 Parent(s): 8ff3654
abstract_model.py ADDED
@@ -0,0 +1,296 @@
+ #!/usr/bin/env python3
+ """
+ Abstract Model - Robust Inference with Forbidden Token Masking (Fixed Dimensions)
+ """
+
+ import json
+ import importlib
+ import inspect
+ from pathlib import Path
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+
+ class AbstractModel(nn.Module):
+     def __init__(self, sft_model_path, device=None):
+         super().__init__()
+         self.sft_model_path = sft_model_path
+
+         if device is None:
+             self._target_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+         else:
+             self._target_device = device
+
+         print(f"Initializing AbstractModel on target device: {self._target_device}")
+
+         self.tokenizer = AutoTokenizer.from_pretrained(sft_model_path, trust_remote_code=True)
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         print(f"Loading SFT model from {sft_model_path}...")
+         sft_model = AutoModelForCausalLM.from_pretrained(
+             sft_model_path,
+             torch_dtype=torch.bfloat16,
+             trust_remote_code=True,
+             attn_implementation="sdpa",
+         )
+         sft_model = sft_model.to(self._target_device)
+         sft_model.eval()
+
+         self.model_backbone = sft_model.model
+         self.lm_head = sft_model.lm_head
+         self.embed_layer = sft_model.get_input_embeddings()
+         self.config = sft_model.config
+
+         self.hidden_size = sft_model.config.hidden_size
+         self.vocab_size = sft_model.config.vocab_size
+
+         # Trainable heads for the continuous ("abstract") mode.
+         self.continuous_head = nn.Linear(self.hidden_size, self.vocab_size, bias=False)
+         self.continuous_embed_layer = nn.Embedding(self.vocab_size, self.hidden_size)
+
+         self.continuous_head = self.continuous_head.to(self._target_device).to(torch.bfloat16)
+         self.continuous_embed_layer = self.continuous_embed_layer.to(self._target_device).to(torch.bfloat16)
+
+         self.think_id = self.tokenizer.encode("<think>", add_special_tokens=False)[0]
+         self.end_think_id = self.tokenizer.encode("</think>", add_special_tokens=False)[0]
+
+         # Structural tokens that must never be emitted while in abstract mode.
+         forbidden_strings = [
+             "<|end_of_text|>", "<|start_of_role|>", "<|end_of_role|>",
+             "<|eot_id|>", "<|start_header_id|>", "user", "assistant", "system",
+             "<tool_call>", "<tool_response>"
+         ]
+
+         self.banned_ids = []
+         if self.tokenizer.eos_token_id is not None:
+             self.banned_ids.append(self.tokenizer.eos_token_id)
+
+         for s in forbidden_strings:
+             ids = self.tokenizer.encode(s, add_special_tokens=False)
+             if ids:
+                 self.banned_ids.extend(ids)
+
+         self.banned_ids = sorted(set(self.banned_ids))
+         print(f"Banned {len(self.banned_ids)} structural tokens from Abstract Mode.")
+
+     @property
+     def device(self):
+         return self.embed_layer.weight.device
+
+     def _init_cache(self, batch_size, max_length):
+         # Prefer the backbone's own hybrid cache class (e.g. Mamba/attention
+         # hybrids) when it exists; otherwise fall back to a plain DynamicCache.
+         try:
+             module = importlib.import_module(self.model_backbone.__module__)
+             if hasattr(module, "HybridMambaAttentionDynamicCache"):
+                 CacheClass = getattr(module, "HybridMambaAttentionDynamicCache")
+                 sig = inspect.signature(CacheClass.__init__)
+                 kwargs = {}
+                 if 'config' in sig.parameters: kwargs['config'] = self.config
+                 if 'batch_size' in sig.parameters: kwargs['batch_size'] = batch_size
+                 elif 'max_batch_size' in sig.parameters: kwargs['max_batch_size'] = batch_size
+                 if 'max_cache_len' in sig.parameters: kwargs['max_cache_len'] = max_length
+                 elif 'max_length' in sig.parameters: kwargs['max_length'] = max_length
+                 if 'device' in sig.parameters: kwargs['device'] = self.device
+                 if 'dtype' in sig.parameters: kwargs['dtype'] = self.embed_layer.weight.dtype
+                 return CacheClass(**kwargs)
+         except Exception:
+             pass
+         from transformers import DynamicCache
+         cache = DynamicCache()
+         cache.has_previous_state = False
+         return cache
+
+     def forward(
+         self,
+         input_ids,
+         max_length=512,
+         temperature=0.7,
+         sample=False,
+         no_grad=True,
+         sigma=0.0,
+         max_thinking_steps=64
+     ):
+         if input_ids.device != self.device:
+             input_ids = input_ids.to(self.device)
+
+         if no_grad:
+             with torch.no_grad():
+                 initial_embeddings = self.embed_layer(input_ids.unsqueeze(0)).squeeze(0)
+         else:
+             initial_embeddings = self.embed_layer(input_ids.unsqueeze(0)).squeeze(0)
+
+         in_abstract_mode = True
+         abstract_step_count = 0
+         generated_tokens = []
+         all_logits = []
+         mode_sequence = []
+
+         past_key_values = self._init_cache(batch_size=1, max_length=max_length + input_ids.shape[0] + 16)
+
+         current_step_input = initial_embeddings.unsqueeze(0)
+         current_seq_len = initial_embeddings.shape[0]
+
+         context = torch.no_grad() if no_grad else torch.enable_grad()
+
+         with context:
+             for step in range(max_length):
+
+                 if step == 0:
+                     position_ids = torch.arange(0, current_seq_len, dtype=torch.long, device=self.device).unsqueeze(0)
+                 else:
+                     position_ids = torch.tensor([[current_seq_len - 1]], dtype=torch.long, device=self.device)
+
+                 outputs = self.model_backbone(
+                     inputs_embeds=current_step_input,
+                     position_ids=position_ids,
+                     past_key_values=past_key_values,
+                     use_cache=True
+                 )
+
+                 past_key_values = outputs.past_key_values
+                 last_hidden = outputs.last_hidden_state[0, -1, :]
+
+                 # 1. Natural head (used for the stopping condition)
+                 logits = self.lm_head(last_hidden)
+                 stop_probs = F.softmax(logits.float(), dim=-1)
+                 natural_next_token = torch.argmax(stop_probs, dim=-1).item()
+
+                 # Force-stop once the thinking budget is exhausted
+                 force_stop = False
+                 if in_abstract_mode:
+                     abstract_step_count += 1
+                     if abstract_step_count >= max_thinking_steps:
+                         natural_next_token = self.end_think_id
+                         force_stop = True
+
+                 # 2. Logic flow
+                 if (natural_next_token == self.end_think_id or force_stop) and in_abstract_mode:
+                     # Transition to natural mode
+                     in_abstract_mode = False
+                     mode_sequence.append('T')
+                     generated_tokens.append(self.end_think_id)
+                     next_embedding = self.embed_layer(torch.tensor([[self.end_think_id]], device=self.device)).squeeze(0).squeeze(0)
+
+                 elif in_abstract_mode:
+                     # Abstract generation: feed back a probability-weighted
+                     # mixture of top-k embeddings instead of a hard token.
+                     mode_sequence.append('A')
+                     cont_logits = self.continuous_head(last_hidden)
+
+                     if self.banned_ids:
+                         cont_logits[self.banned_ids] = float('-inf')
+
+                     cont_logits_f32 = cont_logits.float() / (temperature if temperature else 1.0)
+
+                     abstract_vis_token = torch.argmax(cont_logits_f32, dim=-1).item()
+                     generated_tokens.append(abstract_vis_token)
+
+                     top_k = min(256, self.vocab_size // 4)
+                     top_logits, top_indices = torch.topk(cont_logits_f32, top_k, dim=-1)
+                     top_probs = F.softmax(top_logits, dim=-1).to(torch.bfloat16)
+                     top_embeddings = self.continuous_embed_layer(top_indices)
+                     next_embedding = top_probs @ top_embeddings
+
+                     if sigma > 0.0 and not no_grad:
+                         next_embedding = next_embedding + (torch.randn_like(next_embedding) * sigma)
+                 else:
+                     # Natural generation
+                     mode_sequence.append('N')
+                     generated_tokens.append(natural_next_token)
+                     next_embedding = self.embed_layer(torch.tensor([[natural_next_token]], device=self.device)).squeeze(0).squeeze(0)
+
+                 if no_grad:
+                     all_logits.append(logits.detach().cpu())
+
+                 if natural_next_token == self.tokenizer.eos_token_id and not in_abstract_mode:
+                     break
+
+                 current_step_input = next_embedding.unsqueeze(0).unsqueeze(0)
+                 current_seq_len += 1
+
+         return {
+             'generated_tokens': torch.tensor(generated_tokens),
+             'logits': torch.stack(all_logits) if all_logits else torch.tensor([]),
+             'mode_sequence': mode_sequence,
+         }
+
+     def save_to_directory(self, output_dir):
+         output_path = Path(output_dir)
+         output_path.mkdir(parents=True, exist_ok=True)
+         try:
+             head_state = {k: v.cpu() for k, v in self.continuous_head.state_dict().items()}
+             embed_state = {k: v.cpu() for k, v in self.continuous_embed_layer.state_dict().items()}
+             torch.save(head_state, output_path / "continuous_head.pt")
+             torch.save(embed_state, output_path / "continuous_embed.pt")
+             config = {'sft_model_path': str(self.sft_model_path), 'hidden_size': self.hidden_size, 'vocab_size': self.vocab_size}
+             with open(output_path / "config.json", 'w') as f:
+                 json.dump(config, f)
+             print(f"Saved model to {output_dir}")
+         except Exception as e:
+             print(f"Error saving model: {e}")
+
+     @staticmethod
+     def load_from_directory(output_dir, sft_model_path=None, device='cuda:0'):
+         output_path = Path(output_dir)
+         with open(output_path / "config.json", 'r') as f:
+             config = json.load(f)
+         if sft_model_path is None:
+             sft_model_path = config['sft_model_path']
+         model = AbstractModel(sft_model_path, device=device)
+         print(f"Loading checkpoint to {model.device}...")
+         head_state = torch.load(output_path / "continuous_head.pt", map_location=model.device)
+         embed_state = torch.load(output_path / "continuous_embed.pt", map_location=model.device)
+         model.continuous_head.load_state_dict(head_state)
+         model.continuous_embed_layer.load_state_dict(embed_state)
+         model.continuous_head = model.continuous_head.to(torch.bfloat16)
+         model.continuous_embed_layer = model.continuous_embed_layer.to(torch.bfloat16)
+         return model
+
+
+ if __name__ == '__main__':
+     import argparse
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--sft-model', required=True)
+     parser.add_argument('--load-model', default=None)
+     parser.add_argument('--max-length', type=int, default=256)
+     parser.add_argument('--temperature', type=float, default=0.7)
+     args = parser.parse_args()
+
+     device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+     if args.load_model is not None:
+         model = AbstractModel.load_from_directory(args.load_model, sft_model_path=args.sft_model, device=device)
+     else:
+         # No checkpoint given: fall back to a freshly initialized model.
+         model = AbstractModel(args.sft_model, device=device)
+
+     print("\n" + "=" * 70)
+     print("Abstract Model - Interactive Generation (Masked & Budgeted)")
+     print("=" * 70 + "\n")
+
+     while True:
+         try:
+             prompt = input("You: ").strip()
+             if not prompt:
+                 continue
+             if prompt.lower() in ['q', 'quit']:
+                 break
+
+             sys_prompt = "You are a reasoning assistant. Think step by step before answering."
+             messages = [{"role": "system", "content": sys_prompt}, {"role": "user", "content": prompt}]
+
+             formatted = model.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+             input_ids = model.tokenizer(formatted, return_tensors='pt', add_special_tokens=False)['input_ids'].to(model.device).squeeze(0)
+
+             print("Generating...", end="\r")
+
+             result = model.forward(
+                 input_ids,
+                 max_length=args.max_length,
+                 temperature=args.temperature,
+                 sample=False,
+                 no_grad=True,
+                 sigma=0.0,
+                 max_thinking_steps=128
+             )
+
+             generated_ids = result['generated_tokens'].tolist()
+             modes = result['mode_sequence']
+
+             print("Assistant: ", end="")
+             for token_id, mode in zip(generated_ids, modes):
+                 token_text = model.tokenizer.decode([token_id])
+                 if mode == 'A':
+                     # Abstract-mode tokens are highlighted in cyan.
+                     print(f"\033[96m{token_text}\033[0m", end="", flush=True)
+                 else:
+                     print(token_text, end="", flush=True)
+             print("\n")
+             print(f"[Stats] Abstract: {modes.count('A')} | Natural: {modes.count('N')}")
+             print("-" * 70)
+
+         except KeyboardInterrupt:
+             break
+         except Exception as e:
+             print(f"\nError: {e}")
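The heart of abstract mode is the feedback step in `forward`: instead of committing to a hard token, the model feeds back a probability-weighted mixture of the top-k continuous embeddings. A minimal standalone sketch of that step, with toy sizes standing in for the real `hidden_size`/`vocab_size` and toy modules standing in for `continuous_head`/`continuous_embed_layer`:

import torch
import torch.nn.functional as F

# Toy stand-ins; the real model uses bfloat16 and a banned-id list built
# from structural tokens.
vocab_size, hidden_size, top_k = 1000, 64, 8
head = torch.nn.Linear(hidden_size, vocab_size, bias=False)
embed = torch.nn.Embedding(vocab_size, hidden_size)
last_hidden = torch.randn(hidden_size)

logits = head(last_hidden)
logits[[0, 1, 2]] = float('-inf')                # mask banned structural ids
top_logits, top_indices = torch.topk(logits, top_k)
top_probs = F.softmax(top_logits, dim=-1)
next_embedding = top_probs @ embed(top_indices)  # soft token, shape (hidden_size,)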
create_initialized_abstract.py ADDED
@@ -0,0 +1,28 @@
+ #!/usr/bin/env python3
+ """
+ Create initialized Abstract model checkpoint.
+ """
+
+ import argparse
+ import os
+
+ import torch
+
+ from abstract_model import AbstractModel
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--sft-model', required=True, help='Path to SFT model')
+     parser.add_argument('--output', required=True, help='Output directory for initialized model')
+     args = parser.parse_args()
+
+     print(f"Loading SFT model from: {args.sft_model}")
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+     model = AbstractModel(args.sft_model, device=device)
+
+     print(f"Saving initialized model to: {args.output}")
+     os.makedirs(args.output, exist_ok=True)
+     model.save_to_directory(args.output)
+
+
+ if __name__ == "__main__":
+     main()
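A quick round-trip check of what this script produces (a sketch with hypothetical paths; per `save_to_directory`, the output directory contains continuous_head.pt, continuous_embed.pt, and config.json):

from abstract_model import AbstractModel

model = AbstractModel("path/to/sft-model", device="cpu")   # hypothetical path
model.save_to_directory("out/abstract-init")
reloaded = AbstractModel.load_from_directory("out/abstract-init", device="cpu")
assert reloaded.hidden_size == model.hidden_size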
eval_simple.py ADDED
@@ -0,0 +1,206 @@
+ import json
+ import os
+ import random
+ import re
+ import string
+ import time
+
+ import torch
+ import torch.multiprocessing as mp
+ from tqdm import tqdm
+
+ from abstract_model import AbstractModel
+
+
+ # Placeholder paths: point these at the trained continuous-head checkpoint
+ # and the base SFT model before running.
+ RL_MODEL_PATH = "pathtocontinuoushead"
+ FALLBACK_SFT_PATH = "pathtobasemodel"
+
+ DATASET_FILES = [
+     "../bench/mmlu.jsonl",
+     "../bench/gsm8k.jsonl",
+     "../bench/drop.jsonl"
+ ]
+
+ SAMPLES_PER_BENCHMARK = 1024
+ MAX_THINKING_STEPS = 256
+ MAX_TOTAL_LENGTH = 1536
+ LOG_FILE = "eval_results_random.jsonl"
+
+
+ def normalize_text(s):
+     if s is None:
+         return ""
+     def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
+     def white_space_fix(text): return ' '.join(text.split())
+     def remove_punc(text): return ''.join(ch for ch in text if ch not in set(string.punctuation))
+     return white_space_fix(remove_articles(remove_punc(str(s).lower())))
+
+
+ def extract_answer_content(text):
+     match = re.search(r"<ANSWER>(.*?)</ANSWER>", text, re.DOTALL)
+     if match:
+         return match.group(1).strip()
+     return None
+
+
+ def load_and_sample_data(files, samples_per_file):
+     """Load full datasets and randomly sample N items from each."""
+     final_data = []
+
+     for filename in files:
+         if not os.path.exists(filename):
+             print(f"Warning: File {filename} not found. Skipping.")
+             continue
+
+         # Detect benchmark type from the filename.
+         fname_lower = filename.lower()
+         if "mmlu" in fname_lower: bench_type = "mmlu"
+         elif "gsm8k" in fname_lower: bench_type = "gsm8k"
+         elif "drop" in fname_lower: bench_type = "drop"
+         else: bench_type = "unknown"
+
+         print(f"Loading {filename} ({bench_type})...")
+
+         file_data = []
+         with open(filename, 'r', encoding='utf-8') as f:
+             for line in f:
+                 try:
+                     entry = json.loads(line)
+                     if "benchmark" not in entry:
+                         entry["benchmark"] = bench_type
+                     file_data.append(entry)
+                 except json.JSONDecodeError:
+                     continue
+
+         total_lines = len(file_data)
+
+         if total_lines > samples_per_file:
+             random.shuffle(file_data)
+             selected_data = file_data[:samples_per_file]
+             print(f"  -> Randomly sampled {samples_per_file} from {total_lines} samples.")
+         else:
+             selected_data = file_data
+             print(f"  -> Took all {total_lines} samples (fewer than the requested limit).")
+
+         final_data.extend(selected_data)
+
+     return final_data
+
+
+ def score_sample(pred, truth, benchmark):
+     if benchmark == 'mmlu':
+         p = extract_answer_content(pred)
+         if not p: return False
+         m = re.search(r'([A-D])', p.upper())
+         return m.group(1) == truth.strip().upper() if m else False
+     elif benchmark == 'gsm8k':
+         p = extract_answer_content(pred)
+         if not p: return False
+         t = truth.split("####")[-1].strip() if "####" in truth else truth.strip()
+         return normalize_text(t) in normalize_text(p)
+     else:
+         p = extract_answer_content(pred)
+         if not p: return False
+         return normalize_text(p) == normalize_text(truth)
+
+
+ def gpu_worker(gpu_id, head_path, sft_path, dataset_chunk, results_queue):
+     torch.cuda.set_device(gpu_id)
+     device = f"cuda:{gpu_id}"
+
+     if not os.path.exists(os.path.join(head_path, "continuous_head.pt")):
+         print(f"[GPU {gpu_id}] Critical: continuous_head.pt not found in {head_path}")
+         return
+
+     print(f"[GPU {gpu_id}] Loading Model...")
+     try:
+         model = AbstractModel.load_from_directory(
+             head_path,
+             sft_model_path=sft_path,
+             device=device
+         )
+     except Exception as e:
+         print(f"[GPU {gpu_id}] Error loading model: {e}")
+         return
+
+     results = []
+     iterator = tqdm(dataset_chunk, desc=f"GPU {gpu_id}", position=gpu_id, leave=True)
+
+     for item in iterator:
+         try:
+             sys_prompt = "You are a reasoning assistant. Think step by step before answering."
+             messages = [{"role": "system", "content": sys_prompt}, {"role": "user", "content": item['question']}]
+
+             formatted = model.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+             input_ids = model.tokenizer(formatted, return_tensors='pt', add_special_tokens=False)['input_ids'].to(device).squeeze(0)
+
+             out = model.forward(
+                 input_ids,
+                 max_length=MAX_TOTAL_LENGTH,
+                 temperature=0.0,
+                 sample=False,
+                 no_grad=True,
+                 sigma=0.0,
+                 max_thinking_steps=MAX_THINKING_STEPS
+             )
+
+             full_text = "".join(model.tokenizer.decode([t]) for t in out['generated_tokens'].tolist())
+
+             is_correct = score_sample(full_text, item['answer'], item['benchmark'])
+
+             results.append({
+                 "benchmark": item['benchmark'],
+                 "correct": is_correct,
+                 "think_steps": out['mode_sequence'].count('A'),
+                 "prediction": full_text
+             })
+         except Exception as e:
+             print(f"[GPU {gpu_id}] Error: {e}")
+             continue
+
+     results_queue.put(results)
+
+
+ def run_evaluation():
+     all_data = load_and_sample_data(DATASET_FILES, SAMPLES_PER_BENCHMARK)
+
+     if not all_data:
+         print("No data loaded. Exiting.")
+         return
+
+     print(f"Total Evaluation Set: {len(all_data)} samples.")
+
+     # Split the workload across two GPUs.
+     mid = len(all_data) // 2
+     queue = mp.Queue()
+
+     p1 = mp.Process(target=gpu_worker, args=(0, RL_MODEL_PATH, FALLBACK_SFT_PATH, all_data[:mid], queue))
+     p2 = mp.Process(target=gpu_worker, args=(1, RL_MODEL_PATH, FALLBACK_SFT_PATH, all_data[mid:], queue))
+
+     start_time = time.time()
+     p1.start(); p2.start()
+
+     # Drain the queue before joining to avoid deadlocking on large payloads.
+     final_results = []
+     for _ in range(2):
+         final_results.extend(queue.get())
+     p1.join(); p2.join()
+
+     print(f"Saving detailed logs to {LOG_FILE}...")
+     with open(LOG_FILE, 'w') as f:
+         for r in final_results:
+             f.write(json.dumps(r) + '\n')
+
+     metrics = {}
+     for res in final_results:
+         b = res['benchmark']
+         if b not in metrics:
+             metrics[b] = {'correct': [], 'steps': []}
+         metrics[b]['correct'].append(res['correct'])
+         metrics[b]['steps'].append(res['think_steps'])
+
+     print("\n" + "=" * 50)
+     print(f"FINAL SCORES (Random Sample N={SAMPLES_PER_BENCHMARK})")
+     print("=" * 50)
+
+     for b, d in metrics.items():
+         acc = sum(d['correct']) / len(d['correct']) * 100
+         avg_steps = sum(d['steps']) / len(d['steps'])
+         print(f"{b.upper():<10} | Acc: {acc:.2f}% | Avg Steps: {avg_steps:.1f} | N: {len(d['correct'])}")
+
+     print(f"Total time: {time.time() - start_time:.2f}s")
+
+
+ if __name__ == "__main__":
+     mp.set_start_method('spawn', force=True)
+     run_evaluation()
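The harness assumes each JSONL line carries `question` and `answer` fields (`benchmark` is inferred from the filename when absent), and that the model wraps its final answer in <ANSWER> tags. A sketch of the expected shapes, with illustrative values only:

from eval_simple import score_sample

# Hypothetical gsm8k-style record; the "#### 4" suffix is the gold answer.
item = {"question": "2 + 2 = ?", "answer": "Add the numbers. #### 4", "benchmark": "gsm8k"}
pred = "<think>...</think> The result is <ANSWER>4</ANSWER>"
assert score_sample(pred, item["answer"], item["benchmark"])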
test_soft_embedding_with_trigger.py ADDED
@@ -0,0 +1,164 @@
+ #!/usr/bin/env python3
+ """
+ Test soft embedding with trigger-based mode switching.
+ """
+
+ import argparse
+ from pathlib import Path
+
+ import torch
+ import torch.nn.functional as F
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+
+ class TriggerHead(torch.nn.Module):
+     """SwiGLU-style MLP mapping a hidden state to a scalar trigger logit."""
+
+     def __init__(self, hidden_size, hidden_dim=1024):
+         super().__init__()
+         self.w_gate = torch.nn.Linear(hidden_size, hidden_dim, bias=True)
+         self.w_value = torch.nn.Linear(hidden_size, hidden_dim, bias=True)
+         self.w_out = torch.nn.Linear(hidden_dim, 1, bias=True)
+
+     def forward(self, x):
+         gate = self.w_gate(x)
+         value = self.w_value(x)
+         activated = F.silu(gate) * value
+         x = self.w_out(activated)
+         return x.squeeze(-1)
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Test Soft Embedding with Trigger")
+     parser.add_argument('--sft-model', required=True, help='Path to SFT model')
+     parser.add_argument('--trigger-head', required=True, help='Path to trigger head checkpoint dir')
+     parser.add_argument('--max-length', type=int, default=256, help='Max generation length')
+     parser.add_argument('--threshold', type=float, default=0.5, help='Trigger threshold (>threshold = abstract mode)')
+     parser.add_argument('--temperature', type=float, default=1.0, help='Temperature for softmax')
+     args = parser.parse_args()
+
+     print("=" * 70)
+     print("Testing Soft Embedding with Trigger-Based Mode Switching")
+     print("=" * 70)
+
+     device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+
+     print(f"\nLoading tokenizer from {args.sft_model}...")
+     tokenizer = AutoTokenizer.from_pretrained(args.sft_model, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     print(f"Loading SFT model from {args.sft_model}...")
+     model = AutoModelForCausalLM.from_pretrained(
+         args.sft_model,
+         torch_dtype=torch.bfloat16,
+         trust_remote_code=True,
+         device_map=None
+     ).to(device)
+     model.eval()
+
+     hidden_size = model.config.hidden_size
+     embed_layer = model.get_input_embeddings()
+
+     print(f"Loading trigger head from {args.trigger_head}...")
+     trigger_head = TriggerHead(hidden_size).to(device)
+     checkpoint_path = Path(args.trigger_head) / "trigger_head.pt"
+
+     if not checkpoint_path.exists():
+         print(f"Error: Checkpoint not found at {checkpoint_path}")
+         return
+
+     trigger_state = torch.load(checkpoint_path, map_location=device)
+     trigger_head.load_state_dict(trigger_state)
+     trigger_head.eval()
+
+     print("Models loaded.\n")
+
+     mode_stats = {'natural': 0, 'abstract': 0}
+
+     while True:
+         prompt = input("You: ").strip()
+         if prompt.lower() in ['quit', 'exit', 'q']:
+             break
+         if not prompt:
+             continue
+
+         messages = [{"role": "user", "content": prompt}]
+         formatted = tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+
+         input_ids = tokenizer(
+             formatted,
+             return_tensors='pt',
+             add_special_tokens=False
+         )['input_ids'].to(device)
+
+         print("Assistant: ", end="", flush=True)
+
+         generated_tokens = []
+         mode_sequence = []
+
+         with torch.no_grad():
+             current_embeddings = embed_layer(input_ids).squeeze(0)
+
+             # No KV cache here: the full prefix is re-encoded each step, which
+             # is O(n^2) but keeps the soft-embedding test simple.
+             while len(generated_tokens) + len(input_ids[0]) < args.max_length:
+                 outputs = model.model(
+                     inputs_embeds=current_embeddings.unsqueeze(0),
+                     use_cache=False
+                 )
+                 hidden_state = outputs.last_hidden_state[0, -1]
+
+                 hidden_state_normalized = F.normalize(hidden_state.float(), p=2, dim=-1)
+
+                 trigger_logits = trigger_head(hidden_state_normalized.unsqueeze(0))
+                 trigger_prob = torch.sigmoid(trigger_logits).item()
+                 next_mode = 'S' if trigger_prob > args.threshold else 'N'
+
+                 logits = model.lm_head(hidden_state)
+                 if args.temperature > 0:
+                     logits = logits / args.temperature
+                 probs = F.softmax(logits, dim=-1)
+
+                 if next_mode == 'S':
+                     # Soft embedding: feed back the probability-weighted
+                     # mixture over the full embedding matrix.
+                     mode_sequence.append('S')
+                     embed_matrix = embed_layer.weight.float()
+                     next_embedding = probs.float() @ embed_matrix
+                     next_embedding = next_embedding.to(torch.bfloat16)
+                     next_token = torch.argmax(probs).item()
+                     token_text = tokenizer.decode([next_token])
+                     print(f"<abstract>{token_text}", end="", flush=True)
+                 else:
+                     mode_sequence.append('N')
+                     next_token = torch.argmax(probs).item()
+                     next_embedding = embed_layer(torch.tensor([[next_token]], device=device)).squeeze(0).squeeze(0)
+                     token_text = tokenizer.decode([next_token])
+                     print(token_text, end="", flush=True)
+
+                 if next_token == tokenizer.eos_token_id:
+                     break
+
+                 generated_tokens.append(next_token)
+                 current_embeddings = torch.cat([current_embeddings, next_embedding.unsqueeze(0)], dim=0)
+
+         print("\n")
+
+         if mode_sequence:
+             n_count = mode_sequence.count('N')
+             s_count = mode_sequence.count('S')
+             mode_stats['natural'] += n_count
+             mode_stats['abstract'] += s_count
+             print(f"[Tokens: Natural={n_count}, Switch={s_count}, switch_ratio={s_count/(n_count+s_count)*100:.1f}%]\n")
+
+     print("\n" + "=" * 70)
+     print("Session Statistics:")
+     print(f"  Natural mode tokens: {mode_stats['natural']}")
+     print(f"  Switch point tokens: {mode_stats['abstract']}")
+     if mode_stats['natural'] + mode_stats['abstract'] > 0:
+         total = mode_stats['natural'] + mode_stats['abstract']
+         print(f"  Switch ratio: {mode_stats['abstract']/total*100:.1f}%")
+
+
+ if __name__ == '__main__':
+     main()
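TriggerHead gates the mode switch with a SwiGLU block over the L2-normalized hidden state. A standalone sketch of one switching decision (the hidden size here is hypothetical; the real value comes from model.config.hidden_size):

import torch
import torch.nn.functional as F
from test_soft_embedding_with_trigger import TriggerHead

head = TriggerHead(hidden_size=4096)
hidden = F.normalize(torch.randn(1, 4096), p=2, dim=-1)  # mirrors the loop above
prob = torch.sigmoid(head(hidden)).item()
mode = 'S' if prob > 0.5 else 'N'                        # 0.5 = default --threshold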