Upload chat.py
chat.py
CHANGED
@@ -1,8 +1,3 @@
-"""
-Chat interface for the released CosmicFish model from Hugging Face.
-Compatible with the HF-format release while maintaining all original features.
-"""
-
 import os
 import sys
 import time
@@ -11,30 +6,23 @@ import torch
 import numpy as np
 from termcolor import colored
 import logging
-import readline
+import readline
 import re
 import textwrap
 import random
 from collections import defaultdict
 import json
 
-#
+# Required imports for HF Hub
 try:
     from transformers import GPT2Tokenizer
+    from huggingface_hub import hf_hub_download, snapshot_download
     HF_AVAILABLE = True
 except ImportError:
     HF_AVAILABLE = False
-    print("
-
-
-try:
-    from modeling_cosmicfish import CosmicFish, CosmicConfig
-except ImportError:
-    try:
-        from model import CosmicFish, CosmicConfig
-    except ImportError:
-        print("❌ CosmicFish model classes not found. Make sure modeling_cosmicfish.py or model.py is available.")
-        sys.exit(1)
+    print("Required libraries not available.")
+    print("Install with: pip install transformers huggingface-hub")
+    sys.exit(1)
 
 # Set up logging
 logging.basicConfig(
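A quick way to confirm the guarded imports above will resolve before running the script (a minimal check, not part of this commit; both packages expose __version__):

# Minimal environment check for the guarded imports above (not part of this commit).
import transformers, huggingface_hub
print(transformers.__version__, huggingface_hub.__version__)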
@@ -44,10 +32,299 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
+# Default model repository
+DEFAULT_MODEL_REPO = "MistyozAI/CosmicFish-120M"
+
 # Default prompt template
 DEFAULT_PROMPT_TEMPLATE = "Below is a conversation between a helpful AI assistant and a human. The assistant is knowledgeable, friendly, and provides detailed and accurate responses.\n\n"
 
 
+class CosmicConfig:
+    """Configuration class for CosmicFish."""
+
+    def __init__(self,
+                 vocab_size=50257,
+                 block_size=512,
+                 n_layer=12,
+                 n_head=16,
+                 n_embd=704,
+                 bias=True,
+                 dropout=0.0,
+                 n_query_groups=4,
+                 eps=1e-6,
+                 use_rotary=True,
+                 use_swiglu=True,
+                 use_qk_norm=False,
+                 use_gqa=True):
+        self.vocab_size = vocab_size
+        self.block_size = block_size
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_embd = n_embd
+        self.bias = bias
+        self.dropout = dropout
+        self.eps = eps
+        self.use_rotary = use_rotary
+        self.use_swiglu = use_swiglu
+        self.use_qk_norm = use_qk_norm
+        self.use_gqa = use_gqa
+        self.n_query_groups = n_query_groups if use_gqa else n_head
+        # Ensure n_head is divisible by n_query_groups
+        assert n_head % self.n_query_groups == 0, "n_head must be divisible by n_query_groups"
+
+
+class RMSNorm(torch.nn.Module):
+    """Root Mean Square Normalization"""
+
+    def __init__(self, dim, eps=1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = torch.nn.Parameter(torch.ones(dim))
+
+    def forward(self, x):
+        rms = torch.sqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
+        return self.weight * (x / rms)
+
+
+def precompute_freqs_cis(dim, end, theta=10000.0):
+    """Precompute the frequency tensor for complex exponentials (cis)"""
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+    t = torch.arange(end, device=freqs.device)
+    freqs = torch.outer(t, freqs)
+    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+    return freqs_cis
+
+
+def apply_rotary_emb(xq, xk, freqs_cis):
+    """Apply rotary embeddings to input tensors"""
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+
+    seq_len = xq_.size(2)
+    if freqs_cis.size(0) < seq_len:
+        raise ValueError(f"freqs_cis has only {freqs_cis.size(0)} values but sequence length is {seq_len}")
+
+    freqs_cis_seq = freqs_cis[:seq_len]
+    xq_out = torch.view_as_real(xq_ * freqs_cis_seq.unsqueeze(0)).flatten(3)
+    xk_out = torch.view_as_real(xk_ * freqs_cis_seq.unsqueeze(0)).flatten(3)
+
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+
+
+class GroupedQueryAttention(torch.nn.Module):
+    """Grouped Query Attention (GQA) implementation"""
+
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+
+        head_dim = config.n_embd // config.n_head
+        self.head_dim = head_dim
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.n_query_groups = config.n_query_groups
+
+        self.kv_heads = config.n_head // config.n_query_groups if config.use_gqa else config.n_head
+        qkv_proj_size = (config.n_head + 2 * self.kv_heads) * head_dim
+
+        self.c_attn = torch.nn.Linear(config.n_embd, qkv_proj_size, bias=config.bias)
+        self.c_proj = torch.nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+
+        # Flash attention support
+        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+        if not self.flash:
+            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
+                                 .view(1, 1, config.block_size, config.block_size))
+
+        # Query-key normalization
+        self.qk_norm = getattr(config, 'use_qk_norm', False)
+        if self.qk_norm:
+            self.q_norm = RMSNorm(head_dim, eps=getattr(config, 'eps', 1e-6))
+            self.k_norm = RMSNorm(head_dim, eps=getattr(config, 'eps', 1e-6))
+
+    def forward(self, x, freqs_cis=None):
+        B, T, C = x.size()
+        qkv = self.c_attn(x)
+        head_dim = C // self.n_head
+
+        q_size = self.n_head * head_dim
+        k_size = self.kv_heads * head_dim
+        v_size = self.kv_heads * head_dim
+
+        q, k, v = qkv.split([q_size, k_size, v_size], dim=2)
+
+        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
+        k = k.view(B, T, self.kv_heads, head_dim).transpose(1, 2)
+        v = v.view(B, T, self.kv_heads, head_dim).transpose(1, 2)
+
+        # Repeat k and v if needed for GQA
+        if self.kv_heads < self.n_head:
+            repeats = self.n_head // self.kv_heads
+            k = k.repeat_interleave(repeats, dim=1)
+            v = v.repeat_interleave(repeats, dim=1)
+
+        # Apply rotary embeddings
+        if freqs_cis is not None:
+            q, k = apply_rotary_emb(q, k, freqs_cis)
+
+        # Apply query-key normalization
+        if self.qk_norm:
+            q = self.q_norm(q)
+            k = self.k_norm(k)
+
+        # Compute attention
+        if self.flash:
+            y = torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True
+            )
+        else:
+            att = (q @ k.transpose(-2, -1)) * (1.0 / torch.sqrt(torch.tensor(k.size(-1), dtype=torch.float32)))
+            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+            att = torch.nn.functional.softmax(att, dim=-1)
+            y = att @ v
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
+        y = self.c_proj(y)
+        return y
+
+
+class Block(torch.nn.Module):
+    """Transformer block"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = RMSNorm(config.n_embd, eps=config.eps)
+        self.ln_2 = RMSNorm(config.n_embd, eps=config.eps)
+        self.attn = GroupedQueryAttention(config)
+
+        # MLP implementation based on configuration
+        if config.use_swiglu:
+            # SwiGLU MLP
+            self.mlp = torch.nn.ModuleDict(dict(
+                gate=torch.nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias),
+                up=torch.nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias),
+                down=torch.nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias),
+                act=torch.nn.SiLU(),
+            ))
+            m = self.mlp
+            self.mlpf = lambda x: m.down(m.act(m.up(x)) * m.gate(x))
+        else:
+            # Traditional MLP
+            self.mlp = torch.nn.ModuleDict(dict(
+                c_fc=torch.nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias),
+                c_proj=torch.nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias),
+                act=torch.nn.GELU(),
+            ))
+            m = self.mlp
+            self.mlpf = lambda x: m.c_proj(m.act(m.c_fc(x)))
+
+    def forward(self, x, freqs_cis=None):
+        x = x + self.attn(self.ln_1(x), freqs_cis)
+        x = x + self.mlpf(self.ln_2(x))
+        return x
+
+
+class CosmicFish(torch.nn.Module):
+    """
+    CosmicFish model for inference only.
+    Features: Rotary Positional Embeddings, Grouped-Query Attention, SwiGLU, RMSNorm
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.transformer = torch.nn.ModuleDict(dict(
+            wte=torch.nn.Embedding(config.vocab_size, config.n_embd),
+            h=torch.nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f=RMSNorm(config.n_embd, eps=config.eps),
+        ))
+
+        self.lm_head = torch.nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        # Share weights between embedding and output
+        self.transformer.wte.weight = self.lm_head.weight
+
+        # Precompute rotary embedding frequencies
+        if config.use_rotary:
+            head_dim = config.n_embd // config.n_head
+            self.freqs_cis = precompute_freqs_cis(head_dim, config.block_size)
+        else:
+            self.freqs_cis = None
+            self.transformer.wpe = torch.nn.Embedding(config.block_size, config.n_embd)
+
+    def get_num_params(self, non_embedding=True):
+        """Return the number of parameters in the model."""
+        n_params = sum(p.numel() for p in self.parameters())
+        if non_embedding and hasattr(self.transformer, 'wpe'):
+            n_params -= self.transformer.wpe.weight.numel()
+        return n_params
+
+    def forward(self, idx, targets=None):
+        """Forward pass through the model."""
+        device = idx.device
+        b, t = idx.size()
+        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+
+        # Get token embeddings
+        tok_emb = self.transformer.wte(idx)
+
+        # Handle positional embeddings
+        if self.config.use_rotary:
+            x = tok_emb
+            freqs_cis = self.freqs_cis.to(device) if self.freqs_cis is not None else None
+        else:
+            pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)
+            pos_emb = self.transformer.wpe(pos)
+            x = tok_emb + pos_emb
+            freqs_cis = None
+
+        # Apply transformer blocks
+        for block in self.transformer.h:
+            x = block(x, freqs_cis)
+
+        # Apply final normalization
+        x = self.transformer.ln_f(x)
+
+        # Calculate outputs
+        if targets is not None:
+            logits = self.lm_head(x)
+            loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+        else:
+            # For inference, only compute logits for the last token
+            logits = self.lm_head(x[:, [-1], :])
+            loss = None
+
+        return logits, loss
+
+    @torch.no_grad()
+    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+        """
+        Generate text by sampling from the model, token by token.
+        """
+        for _ in range(max_new_tokens):
+            # Crop sequence to block size if needed
+            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
+
+            # Forward pass
+            logits, _ = self(idx_cond)
+            logits = logits[:, -1, :] / temperature
+
+            # Apply top-k sampling
+            if top_k is not None:
+                v, _ = torch.topk(logits, top_k)
+                logits[logits < v[:, [-1]]] = -float('Inf')
+
+            # Sample next token
+            probs = torch.nn.functional.softmax(logits, dim=-1)
+            idx_next = torch.multinomial(probs, num_samples=1)
+
+            # Append to sequence
+            idx = torch.cat((idx, idx_next), dim=1)
+
+        return idx
+
+
 class RepetitionPenaltyLogitsProcessor:
     """Apply repetition penalty to prevent repeating tokens."""
 
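For review purposes, a minimal smoke test of the inference classes added above; a sketch assuming CosmicConfig and CosmicFish from this diff are in scope (the config values here are deliberately tiny, not the released 120M settings):

# Smoke test for the classes added in this diff (illustrative only).
import torch

config = CosmicConfig(n_layer=2, n_head=4, n_embd=64, n_query_groups=2, block_size=32)
model = CosmicFish(config).eval()

idx = torch.randint(0, config.vocab_size, (1, 8))  # dummy token ids
logits, loss = model(idx)   # inference path returns logits for the last position only
assert logits.shape == (1, 1, config.vocab_size) and loss is None

out = model.generate(idx, max_new_tokens=5, temperature=0.8, top_k=50)
assert out.shape == (1, 13)  # original 8 tokens plus 5 new ones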
@@ -64,7 +341,7 @@ class RepetitionPenaltyLogitsProcessor:
 
 
 class CosmicFishChatSession:
-    """Chat session for
+    """Chat session for CosmicFish model from Hugging Face Hub."""
 
     def __init__(self, model, tokenizer, config):
        """Initialize chat session with model and configuration."""
@@ -123,11 +400,17 @@ class CosmicFishChatSession:
         """Print a welcome message to the user."""
         welcome_text = f"""
 {'=' * 80}
-Welcome to CosmicFish
+Welcome to CosmicFish!
 
-This is a {self.model.get_num_params() / 1e6:.1f}M parameter model.
+This is a {self.model.get_num_params() / 1e6:.1f}M parameter model made by MistyozAI.
 CosmicFish features advanced architecture with RoPE, GQA, SwiGLU, and RMSNorm.
 
+⚠️ DISCLAIMER: Since this {self.model.get_num_params() / 1e6:.1f}M parameter model is relatively
+small, it is more likely to give incorrect answers or hallucinate compared to
+larger models. Please verify important information from reliable sources.
+
+Model: {DEFAULT_MODEL_REPO}
+
 Type your prompts and CosmicFish will respond.
 
 Special commands:
@@ -211,9 +494,19 @@ Special commands:
         return False
 
     def _clean_token_text(self, text):
-
-        # Fix the specific issue with �� -> '
+
         text = text.replace('��', "'")
+
+        text = text.replace('�', "'")
+        text = text.replace('\ufffd', "'")
+        text = text.replace('\uFFFD', "'")
+
+        text = text.replace('’', "'")
+        text = text.replace('“', "'")
+        text = text.replace('�', "'")
+        text = text.replace('â€"', "'")
+        text = text.replace('â€"', "'")
+
         return text
 
     def generate_with_repetition_penalty(self, input_ids, max_new_tokens, temperature, top_k, penalty=1.2, live=False):
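One aside on the additions above: several of the new replace() calls name the same code point, so they are redundant with one another (Python hex escapes are case-insensitive, and the literal replacement character is U+FFFD):

# '\ufffd', '\uFFFD', and the literal '�' all denote U+FFFD REPLACEMENT CHARACTER.
assert '\ufffd' == '\uFFFD' == '�'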
@@ -478,6 +771,7 @@ Token usage statistics:
 - Current repetition penalty: {self.repetition_penalty}
 - Current temperature: {self.config.temperature}
 - Model: CosmicFish ({self.model.get_num_params() / 1e6:.1f}M parameters)
+- Source: {DEFAULT_MODEL_REPO}
 """
         print(colored(stats, 'yellow'))
         return True
@@ -615,76 +909,80 @@ Token usage statistics:
         return True
 
 
-def
-    """
-    print(f"
-
-    # Load config
-    config_path = os.path.join(model_dir, "config.json")
-    if not os.path.exists(config_path):
-        raise FileNotFoundError(f"config.json not found in {model_dir}")
-
-    with open(config_path, "r") as f:
-        config_dict = json.load(f)
-
-    # Create CosmicConfig
-    config = CosmicConfig(
-        vocab_size=config_dict["vocab_size"],
-        block_size=config_dict["block_size"],
-        n_layer=config_dict["n_layer"],
-        n_head=config_dict["n_head"],
-        n_embd=config_dict["n_embd"],
-        bias=config_dict["bias"],
-        dropout=0.0,  # Set to 0 for inference
-        eps=config_dict.get("eps", 1e-6),
-        use_rotary=config_dict["use_rotary"],
-        use_swiglu=config_dict["use_swiglu"],
-        use_gqa=config_dict["use_gqa"],
-        n_query_groups=config_dict["n_query_groups"],
-        use_qk_norm=config_dict.get("use_qk_norm", False)
-    )
-
-    # Create model
-    model = CosmicFish(config)
-
-    # Load weights
-    weights_path = os.path.join(model_dir, "pytorch_model.bin")
-    if not os.path.exists(weights_path):
-        raise FileNotFoundError(f"pytorch_model.bin not found in {model_dir}")
-
-    state_dict = torch.load(weights_path, map_location=device)
-    model.load_state_dict(state_dict)
-    model.to(device)
-    model.eval()
-
-    print(f"✅ Model loaded: {model.get_num_params() / 1e6:.1f}M parameters")
-    return model, config
+def download_cosmicfish_from_hub(model_repo=DEFAULT_MODEL_REPO, device='cpu'):
+    """Download and load CosmicFish model from Hugging Face Hub"""
+    print(colored(f"Downloading CosmicFish from Hugging Face: {model_repo}", "cyan"))
 
+    try:
+        # Download the model files to local cache
+        print("Downloading model files...")
+        cache_dir = snapshot_download(repo_id=model_repo, cache_dir=None)
+        print(f"Model cached at: {cache_dir}")
+
+        # Load config
+        config_path = os.path.join(cache_dir, "config.json")
+        with open(config_path, "r") as f:
+            config_dict = json.load(f)
+
+        # Create CosmicConfig
+        config = CosmicConfig(
+            vocab_size=config_dict["vocab_size"],
+            block_size=config_dict["block_size"],
+            n_layer=config_dict["n_layer"],
+            n_head=config_dict["n_head"],
+            n_embd=config_dict["n_embd"],
+            bias=config_dict["bias"],
+            dropout=0.0,  # Set to 0 for inference
+            eps=config_dict.get("eps", 1e-6),
+            use_rotary=config_dict["use_rotary"],
+            use_swiglu=config_dict["use_swiglu"],
+            use_gqa=config_dict["use_gqa"],
+            n_query_groups=config_dict["n_query_groups"],
+            use_qk_norm=config_dict.get("use_qk_norm", False)
+        )
+
+        # Create model
+        print("Creating model...")
+        model = CosmicFish(config)
+
+        # Load weights
+        print("Loading weights...")
+        weights_path = os.path.join(cache_dir, "pytorch_model.bin")
+        state_dict = torch.load(weights_path, map_location=device)
+        model.load_state_dict(state_dict)
+        model.to(device)
+        model.eval()
 
-
-
-        raise ImportError("transformers library required. Install with: pip install transformers")
+        print(f"Model loaded: {model.get_num_params() / 1e6:.1f}M parameters")
+        print(f"Device: {device}")
+        return model, config
 
-
+    except Exception as e:
+        print(colored(f"Error downloading/loading model: {str(e)}", "red"))
+        print(colored("Make sure you have internet connection and the model repo exists", "yellow"))
+        sys.exit(1)
+
+
+def load_tokenizer():
+    print("Loading tokenizer...")
     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-    print("
+    print("Tokenizer loaded")
     return tokenizer
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Chat with
+    parser = argparse.ArgumentParser(description="Chat with CosmicFish model from Hugging Face Hub")
 
     # Model parameters
-    parser.add_argument("--
-                        help="
+    parser.add_argument("--model_repo", type=str, default=DEFAULT_MODEL_REPO,
+                        help=f"Hugging Face model repository (default: {DEFAULT_MODEL_REPO})")
     parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                         help="Device to use (cuda or cpu)")
 
     # Generation parameters
-    parser.add_argument("--temperature", type=float, default=0.
+    parser.add_argument("--temperature", type=float, default=0.7,
                         help="Temperature for sampling (default: 0.7)")
-    parser.add_argument("--max_tokens", type=int, default=
+    parser.add_argument("--max_tokens", type=int, default=512,
                         help="Maximum number of tokens to generate per response")
     parser.add_argument("--min_tokens", type=int, default=10,
                         help="Minimum number of tokens to generate per response")
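Example invocation using the flags added above (the repo id is the DEFAULT_MODEL_REPO from this diff; the other values are illustrative, the defaults come from the argparse definitions): python chat.py --model_repo MistyozAI/CosmicFish-120M --device cpu --temperature 0.7 --max_tokens 256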
@@ -717,12 +1015,12 @@ def main():
     # Configure device
     device = args.device
     if device == "cuda" and not torch.cuda.is_available():
-        print("CUDA is not available, falling back to CPU")
+        print(colored("CUDA is not available, falling back to CPU", "yellow"))
         device = "cpu"
 
     try:
-        #
-        model, model_config =
+        # Download and load the model from HF Hub
+        model, model_config = download_cosmicfish_from_hub(args.model_repo, device)
 
         # Load tokenizer
         tokenizer = load_tokenizer()
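Note that snapshot_download caches the repo files locally, so after the first successful run the script can reuse the cache. A possible offline variant using the standard huggingface_hub API (not part of this commit):

# Offline fallback sketch: reuse the local cache without hitting the network.
from huggingface_hub import snapshot_download
cache_dir = snapshot_download(repo_id="MistyozAI/CosmicFish-120M", local_files_only=True)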
@@ -751,7 +1049,7 @@
         chat = CosmicFishChatSession(model, tokenizer, config)
 
         # Main chat loop
-        print(colored("\nCosmicFish initialized
+        print(colored("\nCosmicFish initialized! Type your message (or /help for commands).\n", 'cyan'))
 
         while True:
             try:
@@ -819,8 +1117,8 @@
             logger.error(f"Error in chat loop: {str(e)}", exc_info=True)
 
     except Exception as e:
-        print(colored(f"Error
-        logger.error(f"Error
+        print(colored(f"Error setting up chat: {str(e)}", 'red'))
+        logger.error(f"Error setting up chat: {str(e)}", exc_info=True)
         sys.exit(1)
 
 