akkiisfrommars committed on
Commit e226163 · verified · 1 Parent(s): b7345fe

Upload 7 files

Files changed (7)
  1. LICENSE +201 -0
  2. README.md +241 -3
  3. config.json +25 -0
  4. example_usage.py +57 -0
  5. modeling_cosmicfish.py +290 -0
  6. tokenizer_config.json +11 -0
  7. vocab_info.json +5 -0
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2026 Mistyoz AI Private Limited
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md CHANGED
@@ -1,3 +1,241 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ tags:
+ - text-generation
+ - language-model
+ - causal-lm
+ - cosmicfish
+ - 300m
+ - transformer
+ - rope
+ - gqa
+ - swiglu
+ - rmsnorm
+ language: en
+ datasets:
+ - CosmicSet-2.0
+ - akkiisfrommars/TreeCorpusCleanedmodel
+ model_type: CosmicFish
+ pipeline_tag: text-generation
+ ---
+
+ # CosmicFish-300M
+
+ A 300M-class language model (369M parameters) with modern architecture improvements, developed by Mistyoz AI.
+
+ ## Quick Start
+
+ **The easiest way to chat with CosmicFish is our chat.py script:**
+
+ ```bash
+ # Download the chat script from this repository
+ wget https://huggingface.co/MistyozAI/CosmicFish-300M/resolve/main/chat.py
+
+ # Install dependencies
+ pip install transformers huggingface-hub termcolor safetensors
+
+ # Run the chat interface (automatically downloads the model)
+ python chat.py
+ ```
+
+ The `chat.py` script handles model loading and generation, and provides a chat experience with live streaming, a repetition penalty, and conversation commands.
+
+ ## Model Details
+
+ - **Parameters**: 369M
+ - **Architecture**: CosmicFish (RoPE, GQA, SwiGLU, RMSNorm)
+ - **Context Length**: 2048 tokens
+ - **Vocabulary**: 50,257 tokens
+ - **Training Data**: CosmicSet 2.0
+ - **Developer**: Mistyoz AI
+ - **Repository**: MistyozAI/CosmicFish-300M
+ - **Format**: Safetensors
+
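+ The 369M figure can be cross-checked from the values published in `config.json` and the projection shapes defined in `modeling_cosmicfish.py`. A rough, stdlib-only sketch (ignoring bias and norm parameters, and assuming the SwiGLU 4x hidden size and the `kv_heads = n_head // n_query_groups` convention used in the model code):
+
+ ```python
+ # Rough parameter estimate for CosmicFish-300M from its config.json.
+ vocab_size, n_embd, n_layer = 50257, 960, 24
+ n_head, n_query_groups = 24, 4
+
+ head_dim = n_embd // n_head                        # 40
+ kv_heads = n_head // n_query_groups                # 6 key/value heads
+ qkv = n_embd * (n_head + 2 * kv_heads) * head_dim  # fused QKV projection
+ attn_out = n_embd * n_embd                         # attention output projection
+ mlp = 3 * n_embd * (4 * n_embd)                    # SwiGLU: gate, up, down
+ embeddings = vocab_size * n_embd                   # tied with lm_head
+
+ total = embeddings + n_layer * (qkv + attn_out + mlp)
+ print(f"{total / 1e6:.1f}M parameters")            # prints "369.0M parameters"
+ ```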
+ ## Usage
+
+ ### Installation
+
+ ```bash
+ pip install transformers huggingface-hub termcolor safetensors torch
+ ```
+
+ ### Downloading the Model
+
+ ```python
+ from transformers import GPT2Tokenizer
+ from huggingface_hub import snapshot_download
+ from safetensors.torch import load_file
+ import torch
+ import json
+ import os
+
+ # Download model files from the Hugging Face Hub
+ cache_dir = snapshot_download(repo_id="MistyozAI/CosmicFish-300M")
+
+ # Load tokenizer
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+
+ # Load config
+ with open(os.path.join(cache_dir, "config.json")) as f:
+     config_dict = json.load(f)
+
+ # Load model weights from safetensors
+ state_dict = load_file(os.path.join(cache_dir, "model.safetensors"))
+
+ # Note: the full model class is available in the repository
+ print("Model downloaded and ready for use!")
+ ```
+
+ ### Advanced Generation with Repetition Penalty
+
+ ```python
+ def generate_with_repetition_penalty(model, tokenizer, prompt, max_tokens=100, temperature=0.5, penalty=1.2):
+     input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
+     generated = input_ids.clone()
+
+     for _ in range(max_tokens):
+         with torch.no_grad():
+             logits, _ = model(generated)
+
+         next_token_logits = logits[:, -1, :] / temperature
+
+         # Apply repetition penalty: damp logits of already-generated tokens
+         if penalty > 1.0:
+             for token_id in set(generated[0].tolist()):
+                 if next_token_logits[0, token_id] > 0:
+                     next_token_logits[0, token_id] /= penalty
+                 else:
+                     next_token_logits[0, token_id] *= penalty
+
+         probs = torch.nn.functional.softmax(next_token_logits, dim=-1)
+         next_token = torch.multinomial(probs, num_samples=1)
+
+         if next_token.item() == tokenizer.eos_token_id:
+             break
+
+         generated = torch.cat([generated, next_token], dim=1)
+
+     return tokenizer.decode(generated[0], skip_special_tokens=True)
+ ```
+
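+ The penalty rule itself (divide positive logits, multiply negative ones, so a repeated token's probability always drops) can be checked on plain numbers; a minimal, torch-free sketch of the same update:
+
+ ```python
+ # Minimal sketch of the repetition-penalty update used above: logits of
+ # previously generated token ids are pushed down, whether they start
+ # positive (divide by penalty) or negative (multiply by penalty).
+ def penalize(logits, seen_tokens, penalty=1.2):
+     out = list(logits)
+     for t in set(seen_tokens):
+         out[t] = out[t] / penalty if out[t] > 0 else out[t] * penalty
+     return out
+
+ print(penalize([1.2, -1.0, 0.5], seen_tokens=[0, 1]))  # [1.0, -1.2, 0.5]
+ ```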
+ ### Loading Model with Safetensors
+
+ ```python
+ from safetensors.torch import load_file
+ from modeling_cosmicfish import CosmicFish, CosmicConfig
+ import json
+ import os
+
+ def load_cosmicfish_model(model_path):
+     # Load config
+     with open(os.path.join(model_path, "config.json")) as f:
+         config_dict = json.load(f)
+
+     # Create model config
+     config = CosmicConfig(
+         vocab_size=config_dict["vocab_size"],
+         block_size=config_dict["block_size"],
+         n_layer=config_dict["n_layer"],
+         n_head=config_dict["n_head"],
+         n_embd=config_dict["n_embd"],
+         bias=config_dict["bias"],
+         dropout=0.0,
+         use_rotary=config_dict["use_rotary"],
+         use_swiglu=config_dict["use_swiglu"],
+         use_gqa=config_dict["use_gqa"],
+         n_query_groups=config_dict["n_query_groups"]
+     )
+
+     # Create model
+     model = CosmicFish(config)
+
+     # Load weights from safetensors (secure format)
+     state_dict = load_file(os.path.join(model_path, "model.safetensors"))
+
+     # Handle weight sharing (lm_head.weight is tied to transformer.wte.weight)
+     if 'lm_head.weight' not in state_dict and 'transformer.wte.weight' in state_dict:
+         state_dict['lm_head.weight'] = state_dict['transformer.wte.weight']
+
+     model.load_state_dict(state_dict)
+     model.eval()
+
+     return model
+ ```
+
+ ### Chat Interface
+
+ ```python
+ def chat_with_model():
+     conversation = []
+
+     while True:
+         user_input = input("You: ")
+         if user_input.lower() in ['quit', 'exit']:
+             break
+
+         context = "Below is a conversation between a human and an AI assistant.\n\n"
+         for human, ai in conversation:
+             context += f"Human: {human}\nAssistant: {ai}\n\n"
+         context += f"Human: {user_input}\nAssistant:"
+
+         # Generate response with repetition penalty
+         response = generate_with_repetition_penalty(
+             model, tokenizer, context,
+             max_tokens=150, temperature=0.7, penalty=1.2
+         )
+
+         # Extract just the assistant's response
+         response = response.split("Assistant:")[-1].split('\n')[0].strip()
+         print(f"CosmicFish: {response}")
+
+         conversation.append((user_input, response))
+
+ chat_with_model()
+ ```
+
+ ## Architecture
+
+ CosmicFish uses several modern improvements over the standard transformer:
+
+ - **RoPE (Rotary Position Embeddings)**: Better position encoding than absolute positions
+ - **GQA (Grouped-Query Attention)**: Reduces memory usage with 4 query groups
+ - **SwiGLU**: More effective activation function than ReLU/GELU
+ - **RMSNorm**: Simpler, more stable normalization than LayerNorm
+
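+ As a rough illustration of the GQA memory saving, here is a stdlib-only estimate of KV-cache size at full context, using the head counts from `config.json` and the `kv_heads = n_head // n_query_groups` convention from `modeling_cosmicfish.py` (fp16 cache assumed):
+
+ ```python
+ # Rough KV-cache comparison: full multi-head attention vs. GQA,
+ # for CosmicFish-300M's shapes (assumes 2 bytes per element, i.e. fp16).
+ n_embd, n_layer, n_head, n_query_groups = 960, 24, 24, 4
+ head_dim = n_embd // n_head
+ kv_heads = n_head // n_query_groups   # 6, as in modeling_cosmicfish.py
+
+ def kv_cache_mb(heads, seq_len=2048, bytes_per_elem=2):
+     # 2 tensors (K and V) per layer, one head_dim vector per head per token
+     return 2 * n_layer * heads * head_dim * seq_len * bytes_per_elem / 2**20
+
+ print(f"MHA: {kv_cache_mb(n_head):.0f} MiB, GQA: {kv_cache_mb(kv_heads):.0f} MiB")
+ # prints "MHA: 180 MiB, GQA: 45 MiB"
+ ```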
+ ## Training
+
+ - **Dataset**: CosmicSet 2.0
+ - **Sequence Length**: 2048 tokens
+ - **Training Steps**: ~130K iterations
+ - **Hardware**: 1x Nvidia RTX Pro 6000
+
+ ## Performance
+
+ - **Speed**: Varies by hardware (not benchmarked)
+ - **Memory**: ~1GB RAM
+ - **File Size**: 738.6MB
+ - **Loading**: Fast and secure with safetensors
+
+ ## Model Format
+
+ This model is distributed in the **safetensors** format for:
+ - **Security**: Safe loading without arbitrary code execution
+ - **Performance**: Faster loading than pickle-based formats
+ - **Memory efficiency**: Zero-copy loading when possible
+ - **Cross-platform compatibility**: Consistent behavior across environments
+
+ ## Limitations
+
+ - May produce less accurate responses than larger models
+ - 2048-token context limit
+ - English only
+ - Training data cutoff applies
+ - May generate incorrect information
+ - Cannot browse the internet or access real-time data
+
+ ## License
+
+ Apache 2.0 - see the LICENSE file.
+
+ ## Credit
+
+ If you use CosmicFish-300M, please credit Mistyoz AI.
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "model_type": "cosmicfish",
+   "architectures": [
+     "CosmicFish"
+   ],
+   "vocab_size": 50257,
+   "n_embd": 960,
+   "n_layer": 24,
+   "n_head": 24,
+   "block_size": 2048,
+   "bias": true,
+   "dropout": 0.1,
+   "eps": 1e-06,
+   "use_rotary": true,
+   "use_swiglu": true,
+   "use_gqa": true,
+   "use_qk_norm": false,
+   "n_query_groups": 4,
+   "torch_dtype": "float16",
+   "transformers_version": "4.36.0",
+   "use_cache": true,
+   "pad_token_id": 50256,
+   "bos_token_id": 50256,
+   "eos_token_id": 50256
+ }
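A quick stdlib check that these values are mutually consistent with the shape arithmetic in `modeling_cosmicfish.py` (head_dim, KV head count, and the fused QKV projection width are derived from the config, not stored in it):

```python
import json

# Derive the attention shapes implied by config.json, following the
# formulas in GroupedQueryAttention.__init__ of modeling_cosmicfish.py.
config = json.loads("""{
  "vocab_size": 50257, "n_embd": 960, "n_layer": 24,
  "n_head": 24, "block_size": 2048, "n_query_groups": 4
}""")

assert config["n_embd"] % config["n_head"] == 0
head_dim = config["n_embd"] // config["n_head"]            # 40
kv_heads = config["n_head"] // config["n_query_groups"]    # 6
qkv_width = (config["n_head"] + 2 * kv_heads) * head_dim   # c_attn output width

print(head_dim, kv_heads, qkv_width)  # prints "40 6 1440"
```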
example_usage.py ADDED
@@ -0,0 +1,57 @@
+ """
+ Example usage of the CosmicFish model (using safetensors)
+ """
+ import torch
+ from transformers import GPT2Tokenizer
+ from modeling_cosmicfish import CosmicFish, CosmicConfig
+ from safetensors.torch import load_file
+ import json
+
+ def load_cosmicfish(model_dir):
+     """Load the CosmicFish model and tokenizer"""
+     # Load config
+     with open(f"{model_dir}/config.json", "r") as f:
+         config_dict = json.load(f)
+
+     # Create CosmicConfig
+     config = CosmicConfig(
+         vocab_size=config_dict["vocab_size"],
+         block_size=config_dict["block_size"],
+         n_layer=config_dict["n_layer"],
+         n_head=config_dict["n_head"],
+         n_embd=config_dict["n_embd"],
+         bias=config_dict["bias"],
+         dropout=0.0,  # set to 0 for inference
+         use_rotary=config_dict["use_rotary"],
+         use_swiglu=config_dict["use_swiglu"],
+         use_gqa=config_dict["use_gqa"],
+         n_query_groups=config_dict["n_query_groups"],
+         use_qk_norm=config_dict["use_qk_norm"]
+     )
+
+     # Create model
+     model = CosmicFish(config)
+
+     # Load weights from safetensors (safer and faster than pickle)
+     state_dict = load_file(f"{model_dir}/model.safetensors")
+
+     # Handle weight sharing: lm_head.weight is tied to transformer.wte.weight
+     if 'lm_head.weight' not in state_dict and 'transformer.wte.weight' in state_dict:
+         print("Weight sharing detected: tying lm_head.weight to transformer.wte.weight")
+         state_dict['lm_head.weight'] = state_dict['transformer.wte.weight']
+
+     model.load_state_dict(state_dict)
+     model.eval()
+
+     # Load tokenizer
+     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+
+     return model, tokenizer
+
+ # Example usage:
+ # model, tokenizer = load_cosmicfish("./")
+ # input_text = "The future of AI is"
+ # inputs = tokenizer.encode(input_text, return_tensors="pt")
+ # outputs = model.generate(inputs, max_new_tokens=50, temperature=0.7, top_k=None)
+ # response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ # print(response)
modeling_cosmicfish.py ADDED
@@ -0,0 +1,290 @@
+ import math
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+
+
+ class CosmicConfig:
+     """Configuration class for CosmicFish."""
+
+     def __init__(self,
+                  vocab_size=50257,
+                  block_size=2048,
+                  n_layer=24,
+                  n_head=16,
+                  n_embd=960,
+                  bias=True,
+                  dropout=0.0,  # always 0 for inference
+                  n_query_groups=4,
+                  eps=1e-6,
+                  use_rotary=True,
+                  use_swiglu=True,
+                  use_qk_norm=False,
+                  use_gqa=True):
+         self.vocab_size = vocab_size
+         self.block_size = block_size
+         self.n_layer = n_layer
+         self.n_head = n_head
+         self.n_embd = n_embd
+         self.bias = bias
+         self.dropout = dropout
+         self.eps = eps
+         self.use_rotary = use_rotary
+         self.use_swiglu = use_swiglu
+         self.use_qk_norm = use_qk_norm
+         self.use_gqa = use_gqa
+         self.n_query_groups = n_query_groups if use_gqa else n_head
+         # Ensure n_head is divisible by n_query_groups
+         assert n_head % self.n_query_groups == 0, "n_head must be divisible by n_query_groups"
+
+
+ class RMSNorm(nn.Module):
+     """Root Mean Square Normalization"""
+
+     def __init__(self, dim, eps=1e-6):
+         super().__init__()
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(dim))
+
+     def forward(self, x):
+         rms = torch.sqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
+         return self.weight * (x / rms)
+
+
+ def precompute_freqs_cis(dim, end, theta=10000.0):
+     """Precompute the frequency tensor for complex exponentials (cis)"""
+     freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+     t = torch.arange(end, device=freqs.device)
+     freqs = torch.outer(t, freqs)
+     freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+     return freqs_cis
+
+
+ def apply_rotary_emb(xq, xk, freqs_cis):
+     """Apply rotary embeddings to input tensors"""
+     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+
+     seq_len = xq_.size(2)
+     if freqs_cis.size(0) < seq_len:
+         raise ValueError(f"freqs_cis has only {freqs_cis.size(0)} values but sequence length is {seq_len}")
+
+     freqs_cis_seq = freqs_cis[:seq_len]
+     xq_out = torch.view_as_real(xq_ * freqs_cis_seq.unsqueeze(0)).flatten(3)
+     xk_out = torch.view_as_real(xk_ * freqs_cis_seq.unsqueeze(0)).flatten(3)
+
+     return xq_out.type_as(xq), xk_out.type_as(xk)
+
+
+ class GroupedQueryAttention(nn.Module):
+     """Grouped Query Attention (GQA) implementation"""
+
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0
+
+         head_dim = config.n_embd // config.n_head
+         self.head_dim = head_dim
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         self.n_query_groups = config.n_query_groups
+
+         self.kv_heads = config.n_head // config.n_query_groups if config.use_gqa else config.n_head
+         qkv_proj_size = (config.n_head + 2 * self.kv_heads) * head_dim
+
+         self.c_attn = nn.Linear(config.n_embd, qkv_proj_size, bias=config.bias)
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+
+         # Flash attention support
+         self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+         if not self.flash:
+             self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
+                                  .view(1, 1, config.block_size, config.block_size))
+
+         # Query-key normalization
+         self.qk_norm = getattr(config, 'use_qk_norm', False)
+         if self.qk_norm:
+             self.q_norm = RMSNorm(head_dim, eps=getattr(config, 'eps', 1e-6))
+             self.k_norm = RMSNorm(head_dim, eps=getattr(config, 'eps', 1e-6))
+
+     def forward(self, x, freqs_cis=None):
+         B, T, C = x.size()
+         qkv = self.c_attn(x)
+         head_dim = C // self.n_head
+
+         q_size = self.n_head * head_dim
+         k_size = self.kv_heads * head_dim
+         v_size = self.kv_heads * head_dim
+
+         q, k, v = qkv.split([q_size, k_size, v_size], dim=2)
+
+         q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
+         k = k.view(B, T, self.kv_heads, head_dim).transpose(1, 2)
+         v = v.view(B, T, self.kv_heads, head_dim).transpose(1, 2)
+
+         # Repeat k and v if needed for GQA
+         if self.kv_heads < self.n_head:
+             repeats = self.n_head // self.kv_heads
+             k = k.repeat_interleave(repeats, dim=1)
+             v = v.repeat_interleave(repeats, dim=1)
+
+         # Apply rotary embeddings
+         if freqs_cis is not None:
+             q, k = apply_rotary_emb(q, k, freqs_cis)
+
+         # Apply query-key normalization
+         if self.qk_norm:
+             q = self.q_norm(q)
+             k = self.k_norm(k)
+
+         # Compute attention
+         if self.flash:
+             y = torch.nn.functional.scaled_dot_product_attention(
+                 q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True
+             )
+         else:
+             att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+             att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+             att = F.softmax(att, dim=-1)
+             y = att @ v
+
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+         y = self.c_proj(y)
+         return y
+
+
+ class Block(nn.Module):
+     """Transformer block"""
+
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = RMSNorm(config.n_embd, eps=config.eps)
+         self.ln_2 = RMSNorm(config.n_embd, eps=config.eps)
+         self.attn = GroupedQueryAttention(config)
+
+         # MLP implementation based on configuration
+         if config.use_swiglu:
+             # SwiGLU MLP
+             self.mlp = nn.ModuleDict(dict(
+                 gate=nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias),
+                 up=nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias),
+                 down=nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias),
+                 act=nn.SiLU(),
+             ))
+             m = self.mlp
+             self.mlpf = lambda x: m.down(m.act(m.up(x)) * m.gate(x))
+         else:
+             # Traditional MLP
+             self.mlp = nn.ModuleDict(dict(
+                 c_fc=nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias),
+                 c_proj=nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias),
+                 act=nn.GELU(),
+             ))
+             m = self.mlp
+             self.mlpf = lambda x: m.c_proj(m.act(m.c_fc(x)))
+
+     def forward(self, x, freqs_cis=None):
+         x = x + self.attn(self.ln_1(x), freqs_cis)
+         x = x + self.mlpf(self.ln_2(x))
+         return x
+
+
+ class CosmicFish(nn.Module):
+     """
+     CosmicFish model for inference only.
+     Features: Rotary Positional Embeddings, Grouped-Query Attention, SwiGLU, RMSNorm
+     """
+
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+
+         self.transformer = nn.ModuleDict(dict(
+             wte=nn.Embedding(config.vocab_size, config.n_embd),
+             h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+             ln_f=RMSNorm(config.n_embd, eps=config.eps),
+         ))
+
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+         # Share weights between embedding and output
+         self.transformer.wte.weight = self.lm_head.weight
+
+         # Precompute rotary embedding frequencies
+         if config.use_rotary:
+             head_dim = config.n_embd // config.n_head
+             self.freqs_cis = precompute_freqs_cis(head_dim, config.block_size)
+         else:
+             self.freqs_cis = None
+             self.transformer.wpe = nn.Embedding(config.block_size, config.n_embd)
+
+     def get_num_params(self, non_embedding=True):
+         """Return the number of parameters in the model."""
+         n_params = sum(p.numel() for p in self.parameters())
+         if non_embedding and hasattr(self.transformer, 'wpe'):
+             n_params -= self.transformer.wpe.weight.numel()
+         return n_params
+
+     def forward(self, idx, targets=None):
+         """Forward pass through the model."""
+         device = idx.device
+         b, t = idx.size()
+         assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+
+         # Get token embeddings
+         tok_emb = self.transformer.wte(idx)
+
+         # Handle positional embeddings
+         if self.config.use_rotary:
+             x = tok_emb
+             freqs_cis = self.freqs_cis.to(device) if self.freqs_cis is not None else None
+         else:
+             pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)
+             pos_emb = self.transformer.wpe(pos)
+             x = tok_emb + pos_emb
+             freqs_cis = None
+
+         # Apply transformer blocks
+         for block in self.transformer.h:
+             x = block(x, freqs_cis)
+
+         # Apply final normalization
+         x = self.transformer.ln_f(x)
+
+         # Calculate outputs
+         if targets is not None:
+             logits = self.lm_head(x)
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+         else:
+             # For inference, only compute logits for the last token
+             logits = self.lm_head(x[:, [-1], :])
+             loss = None
+
+         return logits, loss
+
+     @torch.no_grad()
+     def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+         """
+         Generate text by sampling from the model, token by token.
+         """
+         for _ in range(max_new_tokens):
+             # Crop sequence to block size if needed
+             idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
+
+             # Forward pass
+             logits, _ = self(idx_cond)
+             logits = logits[:, -1, :] / temperature
+
+             # Apply top-k sampling
+             if top_k is not None:
+                 v, _ = torch.topk(logits, top_k)
+                 logits[logits < v[:, [-1]]] = -float('Inf')
+
+             # Sample next token
+             probs = F.softmax(logits, dim=-1)
+             idx_next = torch.multinomial(probs, num_samples=1)
+
+             # Append to sequence
+             idx = torch.cat((idx, idx_next), dim=1)
+
+         return idx
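The rotary frequency schedule computed by `precompute_freqs_cis` above can be sanity-checked without torch; a stdlib sketch of the same schedule (1/theta^(2i/dim) per frequency pair, rotated by position):

```python
import cmath

# Torch-free sketch of precompute_freqs_cis: for head_dim=40 there are
# 20 frequencies 1/theta**(2*i/dim); position t rotates pair i by angle t*freq_i.
def freqs_cis(dim, end, theta=10000.0):
    freqs = [1.0 / theta ** (2 * i / dim) for i in range(dim // 2)]
    return [[cmath.exp(1j * t * f) for f in freqs] for t in range(end)]

table = freqs_cis(dim=40, end=2048)   # matches head_dim=40, block_size=2048
print(len(table), len(table[0]))      # prints "2048 20"
```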
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "tokenizer_class": "GPT2Tokenizer",
+   "vocab_size": 50257,
+   "model_max_length": 2048,
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "unk_token": "<|endoftext|>",
+   "pad_token": "<|endoftext|>",
+   "add_prefix_space": false,
+   "do_lower_case": false
+ }
vocab_info.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "note": "This model uses GPT-2 tokenizer. Please use: tokenizer = GPT2Tokenizer.from_pretrained('gpt2')",
+   "vocab_size": 50257,
+   "encoding": "gpt2"
+ }