akkiisfrommars committed on
Commit de2d09a · verified · 1 Parent(s): b6bbee5

Upload chat1.py

Files changed (1)
chat1.py +1122 -0
chat1.py ADDED
@@ -0,0 +1,1122 @@
"""
Chat interface for the CosmicFish model directly from the Hugging Face Hub.
Automatically downloads and caches the model from the HF Hub.
"""

import os
import sys
import time
import argparse
import torch
from termcolor import colored
import logging
import readline  # noqa: F401 -- enables arrow-key history in terminal input
import textwrap
import json

# Required imports for HF Hub
try:
    from transformers import GPT2Tokenizer
    from huggingface_hub import snapshot_download
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
    print("❌ Required libraries not available.")
    print("Install with: pip install transformers huggingface-hub")
    sys.exit(1)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

# Default model repository
DEFAULT_MODEL_REPO = "Mistyoz-AI/CosmicFish-120M"

# Default prompt template
DEFAULT_PROMPT_TEMPLATE = "Below is a conversation between a helpful AI assistant and a human. The assistant is knowledgeable, friendly, and provides detailed and accurate responses.\n\n"


class CosmicConfig:
    """Configuration class for CosmicFish."""

    def __init__(self,
                 vocab_size=50257,
                 block_size=512,
                 n_layer=12,
                 n_head=16,
                 n_embd=704,
                 bias=True,
                 dropout=0.0,  # Always 0 for inference
                 n_query_groups=4,
                 eps=1e-6,
                 use_rotary=True,
                 use_swiglu=True,
                 use_qk_norm=False,
                 use_gqa=True):
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.bias = bias
        self.dropout = dropout
        self.eps = eps
        self.use_rotary = use_rotary
        self.use_swiglu = use_swiglu
        self.use_qk_norm = use_qk_norm
        self.use_gqa = use_gqa
        self.n_query_groups = n_query_groups if use_gqa else n_head
        # Ensure n_head is divisible by n_query_groups
        assert n_head % self.n_query_groups == 0, "n_head must be divisible by n_query_groups"


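# Derived sizes (from the defaults above, for orientation): head_dim = n_embd // n_head
# = 704 // 16 = 44, and the attention below uses n_head // n_query_groups = 4 shared
# key/value heads, so each KV head serves 4 query heads.

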
class RMSNorm(torch.nn.Module):
    """Root Mean Square Normalization"""

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = torch.nn.Parameter(torch.ones(dim))

    def forward(self, x):
        rms = torch.sqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
        return self.weight * (x / rms)


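# Worked example (illustrative): for x = [3.0, 4.0] with unit weight and eps ≈ 0,
# rms = sqrt((9 + 16) / 2) ≈ 3.536, so the output is ≈ [0.849, 1.131].
# Unlike LayerNorm, no mean is subtracted and no bias is added.

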
def precompute_freqs_cis(dim, end, theta=10000.0):
    """Precompute the frequency tensor for complex exponentials (cis)"""
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    t = torch.arange(end, device=freqs.device)
    freqs = torch.outer(t, freqs)
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
    return freqs_cis


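# Illustrative shapes: with the defaults (head_dim=44, block_size=512),
# precompute_freqs_cis(44, 512) returns a complex64 tensor of shape (512, 22) --
# one rotation e^{i * t * theta^(-2k/dim)} per position t and channel pair k.

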
def apply_rotary_emb(xq, xk, freqs_cis):
    """Apply rotary embeddings to input tensors"""
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))

    seq_len = xq_.size(2)
    if freqs_cis.size(0) < seq_len:
        raise ValueError(f"freqs_cis has only {freqs_cis.size(0)} values but sequence length is {seq_len}")

    freqs_cis_seq = freqs_cis[:seq_len]
    xq_out = torch.view_as_real(xq_ * freqs_cis_seq.unsqueeze(0)).flatten(3)
    xk_out = torch.view_as_real(xk_ * freqs_cis_seq.unsqueeze(0)).flatten(3)

    return xq_out.type_as(xq), xk_out.type_as(xk)


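# The complex multiply above is the standard RoPE rotation: each channel pair
# (x_{2k}, x_{2k+1}) is treated as one complex number and rotated by the angle
# t * theta^(-2k/dim), which makes the q·k dot product depend only on the
# relative position between query and key.

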
class GroupedQueryAttention(torch.nn.Module):
    """Grouped Query Attention (GQA) implementation"""

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0

        head_dim = config.n_embd // config.n_head
        self.head_dim = head_dim
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.n_query_groups = config.n_query_groups

        self.kv_heads = config.n_head // config.n_query_groups if config.use_gqa else config.n_head
        qkv_proj_size = (config.n_head + 2 * self.kv_heads) * head_dim

        self.c_attn = torch.nn.Linear(config.n_embd, qkv_proj_size, bias=config.bias)
        self.c_proj = torch.nn.Linear(config.n_embd, config.n_embd, bias=config.bias)

        # Flash attention support
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                 .view(1, 1, config.block_size, config.block_size))

        # Query-key normalization
        self.qk_norm = getattr(config, 'use_qk_norm', False)
        if self.qk_norm:
            self.q_norm = RMSNorm(head_dim, eps=getattr(config, 'eps', 1e-6))
            self.k_norm = RMSNorm(head_dim, eps=getattr(config, 'eps', 1e-6))

    def forward(self, x, freqs_cis=None):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        head_dim = C // self.n_head

        q_size = self.n_head * head_dim
        k_size = self.kv_heads * head_dim
        v_size = self.kv_heads * head_dim

        q, k, v = qkv.split([q_size, k_size, v_size], dim=2)

        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
        k = k.view(B, T, self.kv_heads, head_dim).transpose(1, 2)
        v = v.view(B, T, self.kv_heads, head_dim).transpose(1, 2)

        # Repeat k and v if needed for GQA
        if self.kv_heads < self.n_head:
            repeats = self.n_head // self.kv_heads
            k = k.repeat_interleave(repeats, dim=1)
            v = v.repeat_interleave(repeats, dim=1)

        # Apply rotary embeddings
        if freqs_cis is not None:
            q, k = apply_rotary_emb(q, k, freqs_cis)

        # Apply query-key normalization
        if self.qk_norm:
            q = self.q_norm(q)
            k = self.k_norm(k)

        # Compute attention
        if self.flash:
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True
            )
        else:
            att = (q @ k.transpose(-2, -1)) * (1.0 / torch.sqrt(torch.tensor(k.size(-1), dtype=torch.float32)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            att = torch.nn.functional.softmax(att, dim=-1)
            y = att @ v

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y


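# Sizing note (derived from the defaults): the fused QKV projection has
# (16 + 2*4) * 44 = 1056 output features instead of 3 * 704 = 2112 for full
# multi-head attention, roughly halving the QKV projection parameters.

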
class Block(torch.nn.Module):
    """Transformer block"""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = RMSNorm(config.n_embd, eps=config.eps)
        self.ln_2 = RMSNorm(config.n_embd, eps=config.eps)
        self.attn = GroupedQueryAttention(config)

        # MLP implementation based on configuration
        if config.use_swiglu:
            # SwiGLU MLP
            self.mlp = torch.nn.ModuleDict(dict(
                gate=torch.nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias),
                up=torch.nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias),
                down=torch.nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias),
                act=torch.nn.SiLU(),
            ))
            m = self.mlp
            self.mlpf = lambda x: m.down(m.act(m.up(x)) * m.gate(x))
        else:
            # Traditional MLP
            self.mlp = torch.nn.ModuleDict(dict(
                c_fc=torch.nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias),
                c_proj=torch.nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias),
                act=torch.nn.GELU(),
            ))
            m = self.mlp
            self.mlpf = lambda x: m.c_proj(m.act(m.c_fc(x)))

    def forward(self, x, freqs_cis=None):
        x = x + self.attn(self.ln_1(x), freqs_cis)
        x = x + self.mlpf(self.ln_2(x))
        return x


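# Note: the SwiGLU branch computes down(SiLU(up(x)) * gate(x)); the activation
# sits on the "up" projection rather than the "gate" as in some conventions,
# but the naming must stay as-is to match the checkpoint's parameter names.

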
class CosmicFish(torch.nn.Module):
    """
    CosmicFish model for inference only.
    Features: Rotary Positional Embeddings, Grouped-Query Attention, SwiGLU, RMSNorm
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = torch.nn.ModuleDict(dict(
            wte=torch.nn.Embedding(config.vocab_size, config.n_embd),
            h=torch.nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=RMSNorm(config.n_embd, eps=config.eps),
        ))

        self.lm_head = torch.nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Share weights between embedding and output
        self.transformer.wte.weight = self.lm_head.weight

        # Precompute rotary embedding frequencies
        if config.use_rotary:
            head_dim = config.n_embd // config.n_head
            self.freqs_cis = precompute_freqs_cis(head_dim, config.block_size)
        else:
            self.freqs_cis = None
            self.transformer.wpe = torch.nn.Embedding(config.block_size, config.n_embd)

    def get_num_params(self, non_embedding=True):
        """Return the number of parameters in the model."""
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding and hasattr(self.transformer, 'wpe'):
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def forward(self, idx, targets=None):
        """Forward pass through the model."""
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"

        # Get token embeddings
        tok_emb = self.transformer.wte(idx)

        # Handle positional embeddings
        if self.config.use_rotary:
            x = tok_emb
            freqs_cis = self.freqs_cis.to(device) if self.freqs_cis is not None else None
        else:
            pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)
            pos_emb = self.transformer.wpe(pos)
            x = tok_emb + pos_emb
            freqs_cis = None

        # Apply transformer blocks
        for block in self.transformer.h:
            x = block(x, freqs_cis)

        # Apply final normalization
        x = self.transformer.ln_f(x)

        # Calculate outputs
        if targets is not None:
            logits = self.lm_head(x)
            loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # For inference, only compute logits for the last token
            logits = self.lm_head(x[:, [-1], :])
            loss = None

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Generate text by sampling from the model, token by token.
        """
        for _ in range(max_new_tokens):
            # Crop sequence to block size if needed
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]

            # Forward pass
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature

            # Apply top-k sampling
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')

            # Sample next token
            probs = torch.nn.functional.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)

            # Append to sequence
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


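# Illustrative standalone use (hypothetical seed token; the chat session below
# is the real driver):
#   model = CosmicFish(CosmicConfig())
#   idx = torch.tensor([[50256]])  # a single <|endoftext|> token as a seed
#   out = model.generate(idx, max_new_tokens=20, temperature=0.8, top_k=40)

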
class RepetitionPenaltyLogitsProcessor:
    """Apply a repetition penalty to discourage repeating tokens."""

    def __init__(self, penalty=1.2):
        self.penalty = penalty

    def __call__(self, input_ids, scores):
        """Apply the repetition penalty to logits for tokens already seen in input_ids."""
        score = torch.gather(scores, 1, input_ids)
        # If score > 0, penalize by dividing; if score < 0, penalize by multiplying
        score = torch.where(score > 0, score / self.penalty, score * self.penalty)
        scores.scatter_(1, input_ids, score)
        return scores


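# Worked example: with penalty=1.2, a previously seen token with logit 2.0 drops
# to 2.0 / 1.2 ≈ 1.67, and one with logit -1.0 drops to -1.2. Both moves push the
# token away from being sampled again.

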
class CosmicFishChatSession:
    """Chat session for the CosmicFish model from Hugging Face Hub."""

    def __init__(self, model, tokenizer, config):
        """Initialize the chat session with model and configuration."""
        self.model = model
        self.tokenizer = tokenizer
        self.config = config
        self.device = next(model.parameters()).device
        self.history = []
        self.history_tokens = []
        self.max_history_tokens = config.max_history_tokens
        self.prompt_template = config.prompt_template
        self.human_prefix = config.human_prefix
        self.assistant_prefix = config.assistant_prefix
        self.end_of_turn = config.end_of_turn
        self.block_size = config.block_size
        self.debug_mode = config.debug_mode
        self.repetition_penalty = config.repetition_penalty
        self.min_tokens_to_generate = config.min_tokens_to_generate
        self.max_retries = 20

        # Canned clarification requests (currently unused by the generation path)
        self.fallback_responses = [
            "I'd be happy to help with that. Could you provide more details about what specific information you're looking for?",
            "That's a topic I can provide information about. What specific aspects would you like to know?",
            "I understand your question. I can share factual information on this topic if you could specify what aspects you're interested in.",
            "I can help with your question. To give you the most relevant information, could you clarify what specific details you're looking for?",
            "I'd be glad to address your question. To provide the most helpful response, could you specify what particular aspects of this topic interest you?"
        ]

        self.generation_failure_message = "I'm sorry, but I'm having difficulty generating a response to that prompt. Could you try rephrasing your question or asking something else?"

        # For token counting
        self.total_prompt_tokens = 0
        self.total_generated_tokens = 0

        # End markers for live generation
        self.end_markers = [
            f"{self.human_prefix}",
            "Human:",
            "\nHuman:",
            "\nH:",
            "H:",
            "<|endoftext|>",
            "Below is a conversation",
            "\nA:",
            "A:",
            "</s>",
            "User:",
            "\nUser:"
        ]

        # Print welcome message
        if config.display_welcome:
            self._print_welcome_message()

    def _print_welcome_message(self):
        """Print a welcome message to the user."""
        welcome_text = f"""
{'=' * 80}
Welcome to the CosmicFish chat interface (Hugging Face Hub)

This is a {self.model.get_num_params() / 1e6:.1f}M parameter model loaded from HF Hub.
CosmicFish features an advanced architecture with RoPE, GQA, SwiGLU, and RMSNorm.

Model: {DEFAULT_MODEL_REPO}

Type your prompts and CosmicFish will respond.

Special commands:
- /help: Show this help message
- /clear: Clear the conversation history
- /exit or /quit: Exit the chat
- /stats: Show token usage statistics
- /save [filename]: Save the conversation
- /load [filename]: Load a conversation
- /temp [value]: Set temperature (between 0.1 and 2.0)
- /penalty [value]: Set repetition penalty (1.0-2.0)
- /debug: Toggle debug mode
{'=' * 80}
"""
        print(colored(welcome_text, 'cyan'))

    def _format_prompt(self, user_input):
        """Format the complete prompt with history and current input."""
        # Start with the template
        formatted_prompt = self.prompt_template

        # Add conversation history
        for entry in self.history:
            role, text = entry
            if role == "human":
                formatted_prompt += f"{self.human_prefix}{text}{self.end_of_turn}"
            else:  # assistant
                formatted_prompt += f"{self.assistant_prefix}{text}{self.end_of_turn}"

        # Add the current user input
        formatted_prompt += f"{self.human_prefix}{user_input}{self.end_of_turn}{self.assistant_prefix}"

        return formatted_prompt

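    # With the default prefixes, a first-turn prompt looks like (illustrative):
    #   "Below is a conversation ...\n\nHuman: Hi there!\n\nAssistant: "
    # and the model continues from the trailing "Assistant: ".
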
    def _tokenize(self, text):
        """Tokenize text and return token IDs."""
        return self.tokenizer.encode(text)

    def _update_history(self, user_input, response):
        """Update conversation history."""
        # Add to text history
        self.history.append(("human", user_input))
        self.history.append(("assistant", response))

        # Update token history for context window management
        user_tokens = self._tokenize(f"{self.human_prefix}{user_input}{self.end_of_turn}")
        response_tokens = self._tokenize(f"{self.assistant_prefix}{response}{self.end_of_turn}")

        self.history_tokens.extend(user_tokens)
        self.history_tokens.extend(response_tokens)

        # Track token usage
        self.total_prompt_tokens += len(user_tokens)
        self.total_generated_tokens += len(response_tokens)

        # Trim history if it gets too long
        self._trim_history_if_needed()

    def _trim_history_if_needed(self):
        """Trim history to fit within the context window."""
        # Remove the oldest turns until we're under the limit
        while len(self.history_tokens) > self.max_history_tokens and len(self.history) >= 2:
            # Measure the oldest human/assistant pair *before* removing it,
            # so the token count matches the turns actually dropped
            user_turn = self.history[0][1]
            assistant_turn = self.history[1][1]
            user_tokens = len(self._tokenize(f"{self.human_prefix}{user_turn}{self.end_of_turn}"))
            assistant_tokens = len(self._tokenize(f"{self.assistant_prefix}{assistant_turn}{self.end_of_turn}"))

            # Drop the oldest pair from both the text and token history
            self.history = self.history[2:]
            self.history_tokens = self.history_tokens[user_tokens + assistant_tokens:]

    def _should_stop_generation(self, text):
        """Check if generation should stop based on end markers."""
        for marker in self.end_markers:
            if marker in text:
                return True
        return False

    def _clean_token_text(self, text):
        """Clean token text by fixing encoding issues."""
        # Replace a pair of U+FFFD replacement characters (a mis-decoded
        # multi-byte apostrophe) with a plain apostrophe
        text = text.replace('\ufffd\ufffd', "'")
        return text

    def generate_with_repetition_penalty(self, input_ids, max_new_tokens, temperature, top_k, penalty=1.2, live=False):
        """Custom generate function with repetition penalty and optional live generation.

        This is a generator: in live mode it yields (token_text, buffer, should_stop)
        triples; in non-live mode the finished sequence is delivered as the
        generator's return value (i.e., via StopIteration.value).
        """
        model = self.model

        # Ensure model is in eval mode
        model.eval()

        # Initialize sequence with input_ids
        generated = input_ids.clone()

        # Initialize live text buffer
        live_buffer = ""

        # Create repetition penalty processor
        rep_processor = RepetitionPenaltyLogitsProcessor(penalty=penalty)

        # Counter for generated tokens
        tokens_generated = 0
        min_tokens = self.min_tokens_to_generate

        # EOT token ID
        eot_token_id = self.tokenizer.eos_token_id if hasattr(self.tokenizer, 'eos_token_id') else 50256

        # Generate tokens one at a time
        for _ in range(max_new_tokens):
            # Get only the last block_size tokens if the context is too long
            if generated.size(1) > self.block_size:
                context = generated[:, -self.block_size:]
            else:
                context = generated

            # Forward pass for next-token prediction
            with torch.no_grad():
                logits, _ = model(context)

            # Get logits for the next token (last position)
            next_token_logits = logits[:, -1, :]

            # Apply temperature
            next_token_logits = next_token_logits / temperature

            # Apply repetition penalty
            if penalty > 1.0:
                next_token_logits = rep_processor(context, next_token_logits)

            # Optional top-k sampling (top_k=None or 0 disables it)
            if top_k is not None and top_k > 0:
                indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
                next_token_logits[indices_to_remove] = float('-inf')

            # Convert logits to probabilities
            probs = torch.nn.functional.softmax(next_token_logits, dim=-1)

            # Sample next token
            next_token = torch.multinomial(probs, num_samples=1)

            # Check if the next token is EOT and break immediately if so
            if next_token.item() == eot_token_id:
                if live:
                    yield "", live_buffer, True
                break

            # Append next token to the generated sequence
            generated = torch.cat((generated, next_token), dim=1)
            tokens_generated += 1

            # If live generation, decode and yield the token
            if live:
                # Decode the next token
                next_token_text = self.tokenizer.decode([next_token.item()])
                # Clean the token text to fix encoding issues
                next_token_text = self._clean_token_text(next_token_text)
                live_buffer += next_token_text

                # Check if we've hit an end marker in the buffer
                eot_marker_pos = live_buffer.find("<|endoftext|>")
                if eot_marker_pos != -1:
                    # Cut off at the EOT marker
                    live_buffer = live_buffer[:eot_marker_pos]
                    yield "", live_buffer, True
                    break

                # Check other end markers
                should_stop = tokens_generated >= min_tokens and self._should_stop_generation(live_buffer)
                yield next_token_text, live_buffer, should_stop

                if should_stop:
                    break

            # For non-live generation, check if we should stop
            elif tokens_generated >= min_tokens:
                # Check for end markers in the recently generated tokens
                recent_text = self.tokenizer.decode(generated[0, -20:].tolist())
                if self._should_stop_generation(recent_text):
                    break

        # Check if we generated any tokens at all
        if tokens_generated == 0 and not live:
            if self.debug_mode:
                print(colored("\n[No tokens generated in this attempt]", "red"))
            return None

        if not live:
            return generated

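    # Yield protocol (for callers): iterate over (token_text, buffer, should_stop)
    # triples. A final ("", buffer, True) signals a clean stop (EOT or an end
    # marker), with any "<|endoftext|>" already trimmed from the buffer.
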
    def generate_response(self, user_input):
        """Generate a response to the user input."""
        # Format the complete prompt
        prompt = self._format_prompt(user_input)

        # Tokenize the prompt
        input_ids = torch.tensor(self._tokenize(prompt), dtype=torch.long).unsqueeze(0).to(self.device)

        # Ensure we don't exceed the model's context length
        if input_ids.size(1) > self.block_size:
            # If too long, keep the beginning part with the instruction template and trim the middle
            instruction_tokens = self._tokenize(self.prompt_template)
            # Keep the instruction and the most recent conversation that will fit
            keep_from_beginning = len(instruction_tokens)
            keep_from_end = self.block_size - keep_from_beginning

            # Combine beginning and end, ensuring we don't exceed array bounds
            # (<= 0 also guards the -0 slice, which would keep the whole tensor)
            if keep_from_end <= 0:
                # If the instruction alone is too long, trim it (shouldn't happen with reasonable templates)
                input_ids = input_ids[:, :self.block_size]
            else:
                # Keep the instruction and the most recent conversation
                input_ids = torch.cat([
                    input_ids[:, :keep_from_beginning],
                    input_ids[:, -keep_from_end:]
                ], dim=1)

        # Track generation start time
        start_time = time.time()

        # Always use live generation
        return self._generate_live_response(input_ids, user_input, start_time)

    def _generate_live_response(self, input_ids, user_input, start_time):
        """Generate a response with live token-by-token output."""
        # Initialize for live generation
        live_text = ""
        tokens_generated = 0
        retry_count = 0
        generation_failed = False

        # Keep trying until we get a valid response or exhaust the retries
        while retry_count <= self.max_retries:
            if retry_count > 0:
                # Calculate the temperature for this retry
                if retry_count % 2 == 0:
                    # Even retries: increase temperature
                    temp_adjustment = min(0.2 * (retry_count // 2), 0.8)
                    current_temp = min(self.config.temperature + temp_adjustment, 1.8)
                else:
                    # Odd retries: decrease temperature
                    temp_adjustment = min(0.2 * ((retry_count + 1) // 2), 0.4)
                    current_temp = max(self.config.temperature - temp_adjustment, 0.2)

                if self.debug_mode:
                    print(colored(f"\n[Live retry {retry_count}: Using temperature {current_temp:.2f}]", "yellow"))
            else:
                current_temp = self.config.temperature

            # Reset for this attempt
            live_text = ""
            tokens_generated = 0
            generation_failed = False

            # Try to generate with the current settings
            try:
                # Generate with live output
                for token_text, live_buffer, should_stop in self.generate_with_repetition_penalty(
                    input_ids,
                    max_new_tokens=self.config.max_new_tokens,
                    temperature=current_temp,
                    top_k=self.config.top_k,
                    penalty=self.repetition_penalty,
                    live=True
                ):
                    # If we should stop, this is the last iteration
                    if should_stop:
                        # Update with the final clean buffer (EOT removed if present)
                        live_text = live_buffer
                        break

                    # Otherwise add the token and continue
                    if token_text:
                        live_text += token_text
                        tokens_generated += 1
                        yield token_text, live_text, False

                # Check if we got a valid response
                if not live_text or len(live_text.strip()) < 10:
                    if self.debug_mode:
                        print(colored("\n[Live generation produced an empty or too-short response, retrying]", "yellow"))
                    generation_failed = True
                    retry_count += 1
                    # Clear any partial output
                    if retry_count <= self.max_retries:
                        print("\r" + " " * 80 + "\r", end="")  # Clear the line
                else:
                    # We got a valid response, stop retrying
                    break

            except Exception as e:
                if self.debug_mode:
                    print(colored(f"\n[Live generation error: {str(e)}, retrying]", "red"))
                generation_failed = True
                retry_count += 1

        # If we still failed after all retries, use the failure message
        if generation_failed or not live_text or len(live_text.strip()) < 10:
            live_text = self.generation_failure_message
            if self.debug_mode:
                print(colored(f"\n[Returning failure message after {retry_count} live retries]", "red"))

        # Calculate time taken and metrics
        time_taken = time.time() - start_time
        tokens_per_second = tokens_generated / time_taken if time_taken > 0 else 0

        # Update history
        self._update_history(user_input, live_text)

        # Log generation stats
        logger.debug(f"Generated {tokens_generated} tokens in {time_taken:.2f}s ({tokens_per_second:.2f} tokens/s)")

        # Final yield of the complete response
        yield "", live_text, True

    def execute_command(self, command):
        """Execute a special command prefixed with /."""
        command = command.strip()

        if command == '/help':
            self._print_welcome_message()
            return True

        elif command == '/clear':
            self.history = []
            self.history_tokens = []
            print(colored("Conversation history cleared.", 'yellow'))
            return True

        elif command in ['/exit', '/quit']:
            print(colored("Goodbye!", 'cyan'))
            return False  # Signal to exit the chat loop

        elif command == '/stats':
            prompt_tokens = self.total_prompt_tokens
            generated_tokens = self.total_generated_tokens
            total_tokens = prompt_tokens + generated_tokens

            stats = f"""
Token usage statistics:
- Prompt tokens: {prompt_tokens}
- Generated tokens: {generated_tokens}
- Total tokens: {total_tokens}
- Current history length: {len(self.history_tokens)} tokens
- Current repetition penalty: {self.repetition_penalty}
- Current temperature: {self.config.temperature}
- Model: CosmicFish ({self.model.get_num_params() / 1e6:.1f}M parameters)
- Source: {DEFAULT_MODEL_REPO}
"""
            print(colored(stats, 'yellow'))
            return True

        elif command == '/debug':
            self.debug_mode = not self.debug_mode
            self.config.debug_mode = self.debug_mode  # Sync with config
            mode = "enabled" if self.debug_mode else "disabled"
            print(colored(f"Debug mode {mode}", 'yellow'))
            return True

        elif command.startswith('/penalty '):
            try:
                penalty = float(command[9:].strip())
                if 1.0 <= penalty <= 2.0:
                    self.repetition_penalty = penalty
                    print(colored(f"Repetition penalty set to {penalty}", 'yellow'))
                else:
                    print(colored("Repetition penalty should be between 1.0 and 2.0", 'red'))
            except ValueError:
                print(colored("Invalid repetition penalty value. Please use a number between 1.0 and 2.0", 'red'))
            return True

        elif command.startswith('/temp '):
            try:
                temp = float(command[6:].strip())
                if 0.1 <= temp <= 2.0:
                    self.config.temperature = temp
                    print(colored(f"Temperature set to {temp}", 'yellow'))
                else:
                    print(colored("Temperature should be between 0.1 and 2.0", 'red'))
            except ValueError:
                print(colored("Invalid temperature value. Please use a number between 0.1 and 2.0", 'red'))
            return True

        elif command.startswith('/save '):
            filename = command[6:].strip()
            if not filename:
                print(colored("Please specify a filename: /save <filename>", 'red'))
                return True

            try:
                # Create the conversations directory if it doesn't exist
                os.makedirs('conversations', exist_ok=True)

                # Add .txt extension if not present
                if not filename.endswith('.txt'):
                    filename += '.txt'

                filepath = os.path.join('conversations', filename)

                with open(filepath, 'w', encoding='utf-8') as f:
                    for entry in self.history:
                        role, text = entry
                        prefix = self.human_prefix if role == "human" else self.assistant_prefix
                        f.write(f"{prefix}{text}{self.end_of_turn}")

                print(colored(f"Conversation saved to {filepath}", 'green'))

            except Exception as e:
                print(colored(f"Error saving conversation: {str(e)}", 'red'))

            return True

        elif command.startswith('/load '):
            filename = command[6:].strip()
            if not filename:
                print(colored("Please specify a filename: /load <filename>", 'red'))
                return True

            try:
                # Add .txt extension if not present
                if not filename.endswith('.txt'):
                    filename += '.txt'

                filepath = os.path.join('conversations', filename)

                if not os.path.exists(filepath):
                    print(colored(f"File not found: {filepath}", 'red'))
                    return True

                with open(filepath, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Parse conversation turns
                self.history = []
                self.history_tokens = []

                # Split by the end-of-turn marker
                turns = content.split(self.end_of_turn)
                for turn in turns:
                    turn = turn.strip()
                    if not turn:
                        continue

                    if turn.startswith(self.human_prefix):
                        text = turn[len(self.human_prefix):].strip()
                        self.history.append(("human", text))
                    elif turn.startswith(self.assistant_prefix):
                        text = turn[len(self.assistant_prefix):].strip()
                        self.history.append(("assistant", text))

                # Recalculate token counts
                self.history_tokens = []
                for entry in self.history:
                    role, text = entry
                    if role == "human":
                        self.history_tokens.extend(self._tokenize(f"{self.human_prefix}{text}{self.end_of_turn}"))
                    else:
                        self.history_tokens.extend(self._tokenize(f"{self.assistant_prefix}{text}{self.end_of_turn}"))

                print(colored(f"Loaded conversation from {filepath} ({len(self.history) // 2} turns)", 'green'))

                # Print the conversation
                for i in range(0, len(self.history), 2):
                    user_text = self.history[i][1]
                    print(colored(f"\nYou: {user_text}", 'green'))

                    if i + 1 < len(self.history):
                        assistant_text = self.history[i + 1][1]
                        print(colored("CosmicFish: ", 'blue'), end="")
                        for line in assistant_text.split('\n'):
                            wrapped_lines = textwrap.wrap(line, width=100) if line.strip() else ['']
                            for wrapped_line in wrapped_lines:
                                print(wrapped_line)

            except Exception as e:
                print(colored(f"Error loading conversation: {str(e)}", 'red'))

            return True

        else:
            print(colored(f"Unknown command: {command}. Type /help for available commands.", 'red'))
            return True


def download_cosmicfish_from_hub(model_repo=DEFAULT_MODEL_REPO, device='cpu'):
    """Download and load the CosmicFish model from the Hugging Face Hub."""
    print(colored(f"🤗 Downloading CosmicFish from Hugging Face Hub: {model_repo}", "cyan"))

    try:
        # Download the model files to the local cache
        print("📥 Downloading model files...")
        cache_dir = snapshot_download(repo_id=model_repo, cache_dir=None)
        print(f"✅ Model cached at: {cache_dir}")

        # Load config
        config_path = os.path.join(cache_dir, "config.json")
        with open(config_path, "r") as f:
            config_dict = json.load(f)

        # Create CosmicConfig
        config = CosmicConfig(
            vocab_size=config_dict["vocab_size"],
            block_size=config_dict["block_size"],
            n_layer=config_dict["n_layer"],
            n_head=config_dict["n_head"],
            n_embd=config_dict["n_embd"],
            bias=config_dict["bias"],
            dropout=0.0,  # Set to 0 for inference
            eps=config_dict.get("eps", 1e-6),
            use_rotary=config_dict["use_rotary"],
            use_swiglu=config_dict["use_swiglu"],
            use_gqa=config_dict["use_gqa"],
            n_query_groups=config_dict["n_query_groups"],
            use_qk_norm=config_dict.get("use_qk_norm", False)
        )

        # Create model
        print("🧠 Creating model...")
        model = CosmicFish(config)

        # Load weights
        print("⚖️ Loading weights...")
        weights_path = os.path.join(cache_dir, "pytorch_model.bin")
        state_dict = torch.load(weights_path, map_location=device)
        model.load_state_dict(state_dict)
        model.to(device)
        model.eval()

        print(f"✅ Model loaded: {model.get_num_params() / 1e6:.1f}M parameters")
        print(f"🎯 Device: {device}")
        return model, config

    except Exception as e:
        print(colored(f"❌ Error downloading/loading model: {str(e)}", "red"))
        print(colored("💡 Make sure you have an internet connection and that the model repo exists", "yellow"))
        sys.exit(1)


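# Note: the loader above assumes the repo ships a plain `config.json` plus a
# `pytorch_model.bin` state dict whose keys match this module's layer names;
# repos using safetensors or a different layout would need the paths adjusted.

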
def load_tokenizer():
    """Load the GPT-2 tokenizer."""
    print("🔤 Loading GPT-2 tokenizer...")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    print("✅ Tokenizer loaded")
    return tokenizer


def main():
    parser = argparse.ArgumentParser(description="Chat with the CosmicFish model from Hugging Face Hub")

    # Model parameters
    parser.add_argument("--model_repo", type=str, default=DEFAULT_MODEL_REPO,
                        help=f"Hugging Face model repository (default: {DEFAULT_MODEL_REPO})")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device to use (cuda or cpu)")

    # Generation parameters
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Temperature for sampling (default: 0.7)")
    parser.add_argument("--max_tokens", type=int, default=512,
                        help="Maximum number of tokens to generate per response")
    parser.add_argument("--min_tokens", type=int, default=10,
                        help="Minimum number of tokens to generate per response")
    parser.add_argument("--top_k", type=int, default=40,
                        help="Top-k sampling (0 to disable)")
    parser.add_argument("--repetition_penalty", type=float, default=1.2,
                        help="Repetition penalty (1.0 = no penalty, 1.2 = mild, 1.5 = moderate)")

    # Chat parameters
    parser.add_argument("--human_prefix", type=str, default="Human: ",
                        help="Prefix for human messages")
    parser.add_argument("--assistant_prefix", type=str, default="Assistant: ",
                        help="Prefix for assistant messages")
    parser.add_argument("--end_of_turn", type=str, default="\n\n",
                        help="Delimiter between conversation turns")
    parser.add_argument("--instruction", type=str,
                        default=DEFAULT_PROMPT_TEMPLATE,
                        help="Instruction prompt to prepend to the conversation")
    parser.add_argument("--max_history", type=int, default=1024,
                        help="Maximum number of tokens to keep in history")

    # UI parameters
    parser.add_argument("--no_welcome", action="store_true",
                        help="Don't display the welcome message")
    parser.add_argument("--debug", action="store_true",
                        help="Enable debug mode")

    args = parser.parse_args()

    # Configure device
    device = args.device
    if device == "cuda" and not torch.cuda.is_available():
        print(colored("⚠️ CUDA is not available, falling back to CPU", "yellow"))
        device = "cpu"

    try:
        # Download and load the model from HF Hub
        model, model_config = download_cosmicfish_from_hub(args.model_repo, device)

        # Load tokenizer
        tokenizer = load_tokenizer()

        # Create a config object with all the necessary parameters
        class ChatConfig:
            def __init__(self, args, block_size):
                self.device = device
                self.temperature = args.temperature
                self.max_new_tokens = args.max_tokens
                self.min_tokens_to_generate = args.min_tokens
                self.top_k = args.top_k if args.top_k > 0 else None  # 0 disables top-k
                self.human_prefix = args.human_prefix
                self.assistant_prefix = args.assistant_prefix
                self.end_of_turn = args.end_of_turn
                self.prompt_template = args.instruction
                self.max_history_tokens = args.max_history
                self.display_welcome = not args.no_welcome
                self.block_size = block_size
                self.debug_mode = args.debug
                self.repetition_penalty = args.repetition_penalty

        config = ChatConfig(args, model_config.block_size)

        # Initialize the chat session
        chat = CosmicFishChatSession(model, tokenizer, config)

        # Main chat loop
        print(colored("\n🚀 CosmicFish initialized from Hugging Face Hub. Type your message (or /help for commands).\n", 'cyan'))

        while True:
            try:
                # Get user input
                user_input = input(colored("You: ", 'green'))

                # Check if it's a command
                if user_input.startswith('/'):
                    # Execute the command; continue the loop if True, exit if False
                    if not chat.execute_command(user_input):
                        break
                    continue

                # Skip empty input
                if not user_input.strip():
                    continue

                # Generate a response using live generation
                live_buffer = ""
                final_response = None

                # Use the generator version
                response_generator = chat.generate_response(user_input)

                try:
                    # First print the assistant prefix
                    print(colored("CosmicFish: ", 'blue'), end="")
                    sys.stdout.flush()

                    for token, live_text, is_done in response_generator:
                        # If this is the final clean response
                        if is_done:
                            final_response = live_text
                            # Print the final response directly if we didn't get any tokens yet
                            if not live_buffer:
                                print(final_response, end="")
                            break

                        # If we have a token to display
                        if token:
                            # Strip any <|endoftext|> marker and stop if one was present
                            if "<|endoftext|>" in token:
                                token = token.replace("<|endoftext|>", "")
                                if token:  # Only print if there's anything left
                                    print(token, end="", flush=True)
                                break

                            # Display it
                            print(token, end="", flush=True)
                            live_buffer += token

                except KeyboardInterrupt:
                    # Allow the user to interrupt generation
                    print("\n[Generation interrupted]")
                    final_response = "I was going to respond, but I'll stop here since you interrupted."

                # Add an extra line for readability
                print()

            except KeyboardInterrupt:
                print("\n\nKeyboard interrupt detected. Type /exit to quit or continue chatting.")

            except Exception as e:
                print(colored(f"\nError: {str(e)}", 'red'))
                logger.error(f"Error in chat loop: {str(e)}", exc_info=True)

    except Exception as e:
        print(colored(f"Error setting up chat: {str(e)}", 'red'))
        logger.error(f"Error setting up chat: {str(e)}", exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Fatal error: {str(e)}", exc_info=True)
        sys.exit(1)
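
# Example invocation (illustrative; all flags shown are defined in main()):
#   python chat1.py --temperature 0.8 --top_k 40 --repetition_penalty 1.3 --debug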