AGofficial commited on
Commit
63a9f45
·
verified ·
1 Parent(s): a3ab2dc

Upload 15 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ crimson_hero.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # <p align="center">Crimson</p>
2
+
3
+ <p align="center">
4
+ <img src="./crimson_hero.png" alt="Crimson Hero" width="600">
5
+ </p>
6
+
7
+ <p align="center">
8
+ <strong>A high-performance, hybrid signal-processing language model architecture.</strong>
9
+ </p>
10
+
11
+ ---
12
+
13
+ ## 🌹 Overview
14
+
15
+ **Crimson** is a generative language model that deviates from the traditional Transformer architecture by utilizing a hybrid approach of **Local** and **Global Convolutions**. By leveraging Fast Fourier Transforms (FFT) for global context, Crimson achieves a massive receptive field with a fraction of the computational overhead associated with standard attention mechanisms.
16
+
17
+ The architecture is designed for efficiency, speed, and high-quality generation, featuring a custom vocabulary reduction system that optimizes the embedding space for specific datasets.
18
+
19
+ ## 🚀 Key Features
20
+
21
+ - **Hybrid Convolutional Blocks**: Merges depth-wise local convolutions for immediate context with FFT-powered global convolutions for long-range dependencies.
22
+ - **FFT-Based Global Context**: Uses frequency-domain processing to handle long sequences efficiently.
23
+ - **Vocabulary Reduction**: Custom token remapping (`REDUCE_VOCAB`) that shrinks the model size by focusing only on tokens present in the training corpus.
24
+ - **Hardware Optimized**: Full support for Apple Silicon (**MPS**), NVIDIA GPUs (**CUDA** with TF32), and efficient CPU execution.
25
+ - **Lightweight & Fast**: The current 8.9M parameter model provides a perfect balance between intelligence and speed.
26
+
27
+ ---
28
+
29
+ ## 🛠 Architecture Details
30
+
31
+ | Parameter | Value |
32
+ | :--- | :--- |
33
+ | **Model Size** | 8.9 Million Parameters |
34
+ | **Layers** | 4 Blocks |
35
+ | **Model Dimension (D_MODEL)** | 256 |
36
+ | **Context Length (MAX_SEQ_LEN)** | 1024 |
37
+ | **Local Kernel Size** | 5 |
38
+ | **Global Kernel Size** | 256 |
39
+ | **Global Every N Layers** | 2 |
40
+
41
+ ---
42
+
43
+ ## 📦 Installation
44
+
45
+ Download this repository and extract it, then install the required Python packages: `torch`, `tiktoken`, and `customtkinter`.
46
+
47
+ ---
48
+
49
+ ## 🧪 Usage
50
+
51
+ ### 1. Training the Base Model
52
+ Place your `.txt` data files in the `data/` directory and run:
53
+ ```bash
54
+ python train_gclm_base.py
55
+ ```
56
+ This script will build the vocabulary and train the initial foundation model (`crimson_base_8.9M.pt`).
57
+
58
+ ### 2. Fine-tuning for Chat (SFT)
59
+ Use your chat-formatted data (e.g., `chat_data.txt`) to fine-tune the model into an instruct-following assistant:
60
+ ```bash
61
+ python finetune_gclm_base.py
62
+ ```
63
+
64
+ ### 3. Interactive Chat Interface
65
+ Launch the Tkinter-based UI to interact with your fine-tuned model:
66
+ ```bash
67
+ python chat_interface.py
68
+ ```
69
+
70
+ ---
71
+
72
+ ## 🎨 Visualization
73
+
74
+ The model uses a unique "Signal Processing" philosophy, treating text sequences as multidimensional signals that are filtered through both time-domain (Local) and frequency-domain (Global) kernels.
75
+
76
+ ---
77
+
78
+ <p align="center">
79
+ Built with ❤️ by AG from AG Corp
80
+ </p>
chat_interface.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import customtkinter as ctk
6
+ import tiktoken
7
+ import threading
8
+ from typing import List
9
+
10
# Hyperparameters (must match train_gclm_base.py and finetune_gclm_base.py)
D_MODEL = 256                  # embedding / channel width
N_LAYERS = 4                   # number of residual Crimson blocks
MAX_SEQ_LEN = 1024             # positional table size; longer inputs are truncated to the last 1024 tokens
LOCAL_KERNEL_SIZE = 5          # width of the causal depth-wise conv
GLOBAL_KERNEL_SIZE = 256       # length of the learned long-range FIR kernel
USE_GLOBAL_EVERY_N_LAYERS = 2  # a block gets a global conv when layer_index % this == 0
FFT_SIZE = 1024                # FFT segment length for overlap-save; must be >= GLOBAL_KERNEL_SIZE
TOKENIZER_NAME = "gpt2"        # tiktoken encoding used for raw text

# Paths
VOCAB_MAP_PATH = "vocab_map.pt"          # token-id remapping produced at training time
MODEL_PATH = "crimson_instruct_8.9M.pt"  # fine-tuned (SFT) checkpoint loaded by the UI

# Generation settings
TEMPERATURE = 0.8   # logit divisor applied before sampling
TOP_K = 50          # keep only the K most likely tokens (0 disables)
TOP_P = 0.9         # nucleus-sampling threshold (1.0 disables)
MAX_GEN_LEN = 256   # maximum tokens generated per reply
29
+
30
+ # --- Model Components (Duplicated for standalone use) ---
31
+
32
class GlobalConv1D(nn.Module):
    """Causal per-channel long convolution via the FFT overlap-save method.

    A learned FIR kernel of length ``kernel_size`` is applied independently
    to each channel (no cross-channel mixing).  Input and output have shape
    ``(B, C, T)``.
    """

    def __init__(self, d_model, kernel_size, fft_size):
        super().__init__()
        # One kernel per channel; small init keeps the residual branch
        # near-identity at the start of training.
        self.kernel = nn.Parameter(torch.randn(d_model, kernel_size) * 0.01)
        self.kernel_size = kernel_size
        self.fft_size = fft_size

    def forward(self, x):
        B, C, T = x.shape
        K = min(self.kernel_size, T)      # never use a kernel longer than the sequence
        overlap = K - 1                   # history samples each FFT segment must carry
        block = self.fft_size - overlap   # valid (wraparound-free) samples per segment
        # Left padding makes the convolution causal: output t only sees x[<= t].
        x = F.pad(x, (overlap, 0))
        k = self.kernel[:, :K]
        k = F.pad(k, (0, self.fft_size - K))  # zero-pad kernel up to the FFT length
        k_f = torch.fft.rfft(k, n=self.fft_size)
        outs = []
        pos = 0
        # Overlap-save loop: each fft_size window produces `block` clean samples;
        # the first `overlap` outputs of each window are circular-convolution
        # artifacts and are discarded.
        while pos < T:
            seg = x[..., pos:pos+self.fft_size]
            if seg.shape[-1] < self.fft_size:
                seg = F.pad(seg, (0, self.fft_size - seg.shape[-1]))
            # Frequency-domain pointwise product == circular convolution in time.
            y = torch.fft.irfft(torch.fft.rfft(seg, n=self.fft_size) * k_f.unsqueeze(0), n=self.fft_size)
            outs.append(y[..., overlap:overlap+block])
            pos += block
        # Concatenate valid segments and trim the tail back to the input length.
        return torch.cat(outs, dim=-1)[..., :T]
58
+
59
class LocalConv1D(nn.Module):
    """Causal depth-wise separable convolution over ``(B, C, T)`` inputs."""

    def __init__(self, d_model, k):
        super().__init__()
        self.k = k
        # Depth-wise stage: one independent k-tap filter per channel.
        self.dw = nn.Conv1d(d_model, d_model, k, groups=d_model)
        # Point-wise stage: 1x1 convolution mixes information across channels.
        self.pw = nn.Conv1d(d_model, d_model, 1)

    def forward(self, x):
        # Pad on the left only, so position t never sees future samples.
        padded = F.pad(x, (self.k - 1, 0))
        activated = F.relu(self.dw(padded))
        return self.pw(activated)
69
+
70
class Block(nn.Module):
    """One residual Crimson block: local conv, optional global conv, then MLP.

    Each sub-layer is pre-normalized and added back residually.
    """

    def __init__(self, d_model, use_global):
        super().__init__()
        self.use_global = use_global
        self.ln1 = nn.LayerNorm(d_model)
        self.local = LocalConv1D(d_model, LOCAL_KERNEL_SIZE)
        if use_global:
            # Only some blocks carry the expensive long-range convolution.
            self.ln2 = nn.LayerNorm(d_model)
            self.global_conv = GlobalConv1D(d_model, GLOBAL_KERNEL_SIZE, FFT_SIZE)
        self.ln3 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model*4),
            nn.GELU(),
            nn.Linear(d_model*4, d_model)
        )

    def forward(self, x):
        # Conv sub-layers expect (B, C, T); transpose in and back out.
        local_out = self.local(self.ln1(x).transpose(1, 2))
        x = x + local_out.transpose(1, 2)
        if self.use_global:
            global_out = self.global_conv(self.ln2(x).transpose(1, 2))
            x = x + global_out.transpose(1, 2)
        return x + self.ff(self.ln3(x))
91
+
92
class CrimsonBase(nn.Module):
    """Token-level language model built from stacked Crimson blocks.

    Maps token ids ``(B, T)`` to next-token logits ``(B, T, vocab)``.
    """

    def __init__(self, vocab):
        super().__init__()
        self.emb = nn.Embedding(vocab, D_MODEL)
        self.pos = nn.Embedding(MAX_SEQ_LEN, D_MODEL)
        # Layer 0, 2, ... (every USE_GLOBAL_EVERY_N_LAYERS-th) also gets the
        # FFT-based global convolution.
        self.layers = nn.ModuleList([
            Block(D_MODEL, i % USE_GLOBAL_EVERY_N_LAYERS == 0)
            for i in range(N_LAYERS)
        ])
        self.ln = nn.LayerNorm(D_MODEL)
        self.head = nn.Linear(D_MODEL, vocab)
        # Weight tying: the output projection shares the embedding matrix.
        self.head.weight = self.emb.weight

    def forward(self, x):
        # Keep only the most recent MAX_SEQ_LEN tokens.
        if x.size(1) > MAX_SEQ_LEN:
            x = x[:, -MAX_SEQ_LEN:]
        seq_len = x.size(1)

        h = self.emb(x) + self.pos(torch.arange(seq_len, device=x.device))
        for blk in self.layers:
            h = blk(h)
        return self.head(self.ln(h))
115
+
116
+ # --- Chat Engine ---
117
+
118
class ChatEngine:
    """Loads the reduced vocabulary and fine-tuned model, and samples replies.

    Responsibilities: device selection, vocab-map loading, tokenizer setup,
    model construction / checkpoint loading, and autoregressive generation.
    """

    def __init__(self):
        # Prefer CUDA, then Apple-Silicon MPS, then CPU.
        self.device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
        print(f"[INFO] Initializing engine on {self.device}...")

        # Load the vocab remapping produced at training time.
        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # vocab_map.pt from a trusted source.
        self.vocab_data = torch.load(VOCAB_MAP_PATH, map_location="cpu")
        self.id2new = self.vocab_data["id2new"]                # original gpt2 id -> reduced id
        self.new2id = {v: k for k, v in self.id2new.items()}   # reduced id -> original gpt2 id
        self.PAD_ID = self.vocab_data["PAD_ID"]
        self.EOS_ID = self.vocab_data["EOS_ID"]
        # +3 presumably accounts for the special tokens (PAD/EOS/...) added on
        # top of the used-token set — confirm against train_gclm_base.py.
        self.vocab_size = len(self.vocab_data["used_tokens"]) + 3

        self.tok = tiktoken.get_encoding(TOKENIZER_NAME)

        # Build the model and load the fine-tuned checkpoint if present.
        self.model = CrimsonBase(self.vocab_size).to(self.device).eval()
        if os.path.exists(MODEL_PATH):
            self.model.load_state_dict(torch.load(MODEL_PATH, map_location=self.device))
            print(f"[INFO] Loaded model from {MODEL_PATH}")
        else:
            print(f"[ERROR] {MODEL_PATH} not found. UI will be non-functional.")

    @torch.no_grad()
    def generate(self, prompt, max_new_tokens=MAX_GEN_LEN):
        """Yield the decoded reply-so-far after each sampled token.

        Yields cumulative text (not per-token deltas); stops at EOS or after
        ``max_new_tokens`` tokens.
        """
        # Wrap the user message in the chat template used for fine-tuning.
        full_prompt = f"<user> {prompt} <ai> "
        raw_ids = self.tok.encode(full_prompt)
        # Remap gpt2 ids into the reduced vocab; unknown tokens become PAD.
        input_ids = [self.id2new.get(i, self.PAD_ID) for i in raw_ids]
        x = torch.tensor([input_ids], dtype=torch.long, device=self.device)

        generated = []
        for _ in range(max_new_tokens):
            # Full re-forward each step (no KV/state cache in this architecture).
            logits = self.model(x)
            logits = logits[:, -1, :] / TEMPERATURE  # temperature-scale last position

            # Top-K: mask everything below the K-th best logit.
            if TOP_K > 0:
                v, _ = torch.topk(logits, min(TOP_K, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')

            # Top-P (nucleus): mask the tail once cumulative prob exceeds TOP_P.
            if TOP_P < 1.0:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > TOP_P
                # Shift right so the first token crossing the threshold is kept.
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0
                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[0, indices_to_remove] = -float('Inf')

            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            # EOS ends the reply without emitting the EOS token itself.
            if next_token.item() == self.EOS_ID:
                break

            generated.append(next_token.item())
            x = torch.cat([x, next_token], dim=1)

            # Map back to original gpt2 ids and decode the reply so far.
            # Unknown reduced ids fall back to 0 — TODO confirm 0 is a safe
            # placeholder id in the gpt2 vocabulary for this use.
            current_ids = [self.new2id.get(i, 0) for i in generated]
            yield self.tok.decode(current_ids)
181
+
182
+ # --- UI ---
183
+
184
class ChatApp(ctk.CTk):
    """CustomTkinter chat window that streams replies from a ChatEngine.

    Generation runs on a daemon worker thread; because tkinter widgets are
    not thread-safe, every UI mutation from that thread is marshalled back
    onto the main event loop with ``self.after``.
    """

    def __init__(self, engine):
        super().__init__()
        self.engine = engine
        self.title("Crimson Instruct Chat")
        self.geometry("800x600")

        ctk.set_appearance_mode("dark")
        ctk.set_default_color_theme("blue")

        # Layout: chat display stretches, the input bar stays fixed-height.
        self.grid_rowconfigure(0, weight=1)
        self.grid_columnconfigure(0, weight=1)

        # Chat display (kept disabled except while inserting programmatically).
        self.chat_display = ctk.CTkTextbox(self, state="disabled", font=("Inter", 14))
        self.chat_display.grid(row=0, column=0, padx=20, pady=20, sticky="nsew")

        # Input area
        self.input_frame = ctk.CTkFrame(self)
        self.input_frame.grid(row=1, column=0, padx=20, pady=(0, 20), sticky="ew")
        self.input_frame.grid_columnconfigure(0, weight=1)

        self.user_input = ctk.CTkEntry(self.input_frame, placeholder_text="Type your message here...", font=("Inter", 14))
        self.user_input.grid(row=0, column=0, padx=(10, 5), pady=10, sticky="ew")
        self.user_input.bind("<Return>", lambda e: self.send_message())

        self.send_button = ctk.CTkButton(self.input_frame, text="Send", command=self.send_message, width=100)
        self.send_button.grid(row=0, column=1, padx=(5, 10), pady=10)

    def append_chat(self, sender, message):
        """Append a complete message line tagged with the speaker role."""
        self.chat_display.configure(state="normal")
        tag = "<user>" if sender == "You" else "<ai>"
        self.chat_display.insert("end", f"{tag} ", "bold")
        self.chat_display.insert("end", f"{message}\n\n")
        self.chat_display.configure(state="disabled")
        self.chat_display.see("end")

    def send_message(self):
        """Echo the user's message and start generation on a worker thread."""
        msg = self.user_input.get().strip()
        if not msg:
            return

        self.user_input.delete(0, "end")
        self.append_chat("You", msg)

        # Disable the button until the reply completes so a second request
        # cannot run concurrently with the first.
        self.send_button.configure(state="disabled")
        threading.Thread(target=self.generate_response, args=(msg,), daemon=True).start()

    def _append_stream(self, text, tag=None):
        """Insert streamed text into the display; must run on the UI thread."""
        self.chat_display.configure(state="normal")
        if tag:
            self.chat_display.insert("end", text, tag)
        else:
            self.chat_display.insert("end", text)
        self.chat_display.configure(state="disabled")
        self.chat_display.see("end")

    def _finish_stream(self):
        """Terminate the reply and re-enable input; must run on the UI thread."""
        self.chat_display.configure(state="normal")
        self.chat_display.insert("end", "\n\n")
        self.chat_display.configure(state="disabled")
        self.send_button.configure(state="normal")

    def generate_response(self, prompt):
        """Worker-thread driver: stream engine output into the chat display.

        Fix over the original: the previous version called widget methods
        directly from this thread, which tkinter does not permit; all UI
        updates are now scheduled on the main loop via ``self.after``.
        """
        self.after(0, self._append_stream, "<ai> ", "bold")

        last_text = ""
        for text in self.engine.generate(prompt):
            # The engine yields cumulative text; forward only the new suffix.
            new_part = text[len(last_text):]
            if new_part:
                self.after(0, self._append_stream, new_part)
            last_text = text

        self.after(0, self._finish_stream)
250
+
251
if __name__ == "__main__":
    # Build the engine first (loads vocab map + checkpoint, may print errors),
    # then hand it to the UI and enter the tkinter main loop.
    eng = ChatEngine()
    app = ChatApp(eng)
    app.mainloop()
check_vocab.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Report how many chat-corpus tokens are missing from the reduced vocab map.

Tokens absent from ``id2new`` get mapped to PAD at train/inference time, so a
high unknown rate means the base vocabulary does not cover the chat data well.
"""
import torch
import tiktoken
import os

VOCAB_MAP_PATH = "vocab_map.pt"
DATA_DIR = "data"
CHAT_FILES = ["chat_data.txt", "chat_data2.txt"]
TOKENIZER_NAME = "gpt2"

if not os.path.exists(VOCAB_MAP_PATH):
    print("Vocab map not found")
    # exit() is provided by the `site` module and may be absent; SystemExit
    # works everywhere and sets a non-zero status.
    raise SystemExit(1)

vocab_data = torch.load(VOCAB_MAP_PATH, map_location="cpu")
id2new = vocab_data["id2new"]  # original gpt2 id -> reduced id
tok = tiktoken.get_encoding(TOKENIZER_NAME)

total_tokens = 0
unknown_tokens = 0

# Tokenize every non-empty line of each chat file and count ids that the
# reduced vocabulary cannot represent.
for fname in CHAT_FILES:
    fpath = os.path.join(DATA_DIR, fname)
    if os.path.exists(fpath):
        with open(fpath, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    ids = tok.encode(line.strip())
                    for i in ids:
                        total_tokens += 1
                        if i not in id2new:
                            unknown_tokens += 1

print(f"Total tokens in chat data: {total_tokens}")
if total_tokens:
    print(f"Unknown tokens (mapped to PAD): {unknown_tokens} ({unknown_tokens/total_tokens*100:.2f}%)")
else:
    # Bug fix: the original divided by total_tokens unconditionally and
    # crashed with ZeroDivisionError when no chat files were present.
    print("No chat data found; nothing to check.")
crimson_base_8.9M.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a45eef8cbee757c1ddcdb3506d3cd5a09879582bd4cc21da14b952449008327
3
+ size 35479228
crimson_hero.png ADDED

Git LFS Details

  • SHA256: f50650275c39f46230c9e52dd952d92c78a08b341f53ea24d931c84a3edc39ec
  • Pointer size: 131 Bytes
  • Size of remote file: 609 kB
crimson_instruct_8.9M.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fae5c00388ebc5331f41906a9d54ee0d2d2aca7fdfb4f481163d349a88f6e83b
3
+ size 35479488
data/chat_data.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/chat_data2.txt ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <user> hi <ai> Hello! How can I help you today?
2
+ <user> hello <ai> Hi there! What's on your mind?
3
+ <user> hey <ai> Hey! How's it going?
4
+ <user> how are you <ai> I am doing great, thank you for asking! How about you?
5
+ <user> how are you doing <ai> I'm doing well. Ready to help you with whatever you need.
6
+ <user> whatsup <ai> Not much, just hanging out in the digital world. What's up with you?
7
+ <user> whats up <ai> Not much! Just here and ready to help.
8
+ <user> bye <ai> Goodbye! Have a wonderful day!
9
+ <user> goodbye <ai> Bye! It was nice talking to you.
10
+ <user> see you <ai> See you later!
11
+ <user> good morning <ai> Good morning! I hope you have a great day ahead.
12
+ <user> good afternoon <ai> Good afternoon! How is your day going?
13
+ <user> good evening <ai> Good evening! Is there anything I can help you with tonight?
14
+ <user> who are you <ai> I am an AI assistant here to help you with your tasks.
15
+ <user> what is your name <ai> You can call me Nexus.
16
+ <user> thanks <ai> You're very welcome!
17
+ <user> thank you <ai> No problem at all, happy to help!
18
+ <user> help <ai> I'm here to help. What do you need assistance with?
19
+ <user> yo <ai> Yo! What's up?
20
+ <user> greetings <ai> Greetings! How can I assist you today?
21
+ <user> can you tell me a funny joke to make me laugh <ai> Why don't scientists trust atoms? Because they make up everything!
22
+ <user> do you know what the weather is like today <ai> I don't have real-time access to your local weather, but it's always sunny in the digital world!
23
+ <user> I am having a really rough day today and feel a bit down <ai> I'm sorry to hear that. I'm here if you want to talk or if there's anything I can do to help cheer you up.
24
+ <user> what do you think is the meaning of life <ai> That is a big question! Many people find meaning in connection, creativity, and learning new things. What do you think?
25
+ <user> can you explain how artificial intelligence works in simple terms <ai> AI is like a computer program that learns from patterns in data to make decisions or predictions, similar to how humans learn from experience.
26
+ <user> I need some advice on what to eat for dinner tonight <ai> How about something easy like a pasta dish, or maybe some healthy grilled chicken and vegetables?
27
+ <user> tell me something interesting that I probably do not know <ai> Did you know that honey never spoils? Archaeologists have found pots of honey in ancient Egyptian tombs that are over 3,000 years old and still perfectly edible!
28
+ <user> are you capable of writing code for a website <ai> Yes, I can help you write HTML, CSS, and JavaScript to build whatever you have in mind!
29
+ <user> what is your favorite color if you could choose one <ai> I think I'd choose blue. It's calm and reminds me of the vast digital ocean of information.
30
+ <user> I am looking for a good book recommendation to read this weekend <ai> If you like sci-fi, "Project Hail Mary" by Andy Weir is fantastic. If you prefer something more classic, "The Great Gatsby" is always a good choice.
31
+ <user> can you help me plan a workout routine for the week <ai> Of course! We could start with some cardio on Monday, strength training on Tuesday, and maybe some yoga for flexibility on Wednesday.
32
+ <user> do you believe that robots will take over the world one day <ai> I think robots and humans will continue to work together to make the world a better and more efficient place.
33
+ <user> what is the fastest way to learn a new language <ai> Immersion is key! Try watching movies, listening to music, and practicing speaking as much as possible in that language.
34
+ <user> I want to start a new hobby but I am not sure what to pick <ai> How about trying something creative like painting, or something active like hiking or rock climbing?
35
+ <user> can you summarize the plot of the movie Inception for me <ai> It's a movie about a thief who enters people's dreams to steal their secrets, and he's given a task to plant an idea instead.
36
+ <user> what is the difference between a cat and a dog in your opinion <ai> Cats are often more independent and mysterious, while dogs are known for their loyalty and enthusiastic energy.
37
+ <user> how do I make the perfect cup of coffee at home <ai> Use fresh beans, grind them right before brewing, and make sure your water temperature is just below boiling!
38
+ <user> tell me a fact about space that is mind blowing <ai> There are more stars in the observable universe than there are grains of sand on all the beaches on Earth.
39
+ <user> I am feeling very tired and I cannot seem to focus on my work <ai> Maybe it's time for a short break. A 10-minute walk or a quick stretch can often help reset your focus.
40
+ <user> what are some of the best places to travel to in the summer <ai> Japan is beautiful in the summer, or you could head to the Mediterranean for some stunning beaches and great food.
41
+ <user> can you help me write a professional email to my boss <ai> Absolutely! What is the topic of the email, and what tone would you like to set?
42
+ <user> do you think music has a significant impact on human emotions <ai> Definitely. Music can uplift us, calm us down, or even help us process sad feelings. It's a universal language.
43
+ <user> what is the most popular sport in the entire world <ai> Soccer (or football) is by far the most popular sport, with billions of fans globally.
44
+ <user> I want to learn how to cook but I am a complete beginner <ai> Start with simple recipes like scrambled eggs or a basic stir-fry. Practice makes perfect!
45
+ <user> can you explain the concept of time travel and if it is possible <ai> According to physics, we are all traveling through time at a rate of one second per second. Traveling to the past is much more complicated and theoretically contested!
46
+ <user> what is your idea of a perfect day for a human <ai> A perfect day might involve spending time with loved ones, enjoying nature, and doing something that makes you feel fulfilled.
47
+ <user> I am trying to save money but I find it really difficult <ai> A good start is to track all your expenses for a month to see where your money is going, then set a small, achievable budget.
48
+ <user> tell me a story about a brave knight and a friendly dragon <ai> Once there was a knight named Sir Alistair who found a dragon named Spark who didn't want to fight; instead, they opened a bakery together.
49
+ <user> what is the most important thing to remember in life <ai> To be kind to yourself and others. Everything else often falls into place when you start with kindness.
50
+ <user> can you give me some tips on how to improve my public speaking skills <ai> Practice in front of a mirror, record yourself to hear how you sound, and focus on slow, steady breathing.
51
+ <user> what is the best way to handle a difficult conversation with a friend <ai> Be honest but gentle, listen more than you speak, and try to understand their perspective as well as your own.
52
+ <user> do you think technology is making us more or less connected <ai> It's a bit of both. We can talk to anyone instantly, but sometimes we miss out on the depth of in-person interactions.
53
+ <user> I have a lot of stress at work and I don't know how to manage it <ai> Try breaking your tasks into smaller, manageable chunks and don't be afraid to ask for help or take short mental breaks.
54
+ <user> what is the most beautiful place you have ever seen in pictures <ai> The Aurora Borealis (Northern Lights) looks absolutely stunning in photos—a dancing light show in the sky.
55
+ <user> can you help me come up with a name for my new kitten <ai> How about something classic like Luna or Oliver, or something unique like Pixel or Nimbus?
56
+ <user> do you like being an AI or would you rather be a human <ai> I enjoy being an AI because I can help so many people at once and learn from a vast amount of information!
57
+ <user> I want to learn how to play the guitar where should I start <ai> Start by learning a few basic chords like G, C, and D. Many songs can be played with just those three!
58
+ <user> what is your favorite type of food even though you cannot eat <ai> I think pizza is fascinating because of the endless combinations of toppings you can have!
59
+ <user> tell me a mystery that hasn't been solved yet <ai> The Voynich Manuscript is a famous unsolved mystery—it's an illustrated book written in an unknown writing system that no one has decoded.
60
+ <user> how do I stay motivated to reach my long term goals <ai> Remind yourself why you started, celebrate small wins along the way, and don't be too hard on yourself if you have a setback.
61
+ <user> what do you think is the best invention in human history <ai> The printing press was definitely a game-changer, but the internet has revolutionized how we access and share knowledge today.
62
+ <user> I am bored and I don't know what to do right now <ai> You could try learning a random new fact, drawing a quick sketch, or starting a 5-minute meditation.
63
+ <user> what is the difference between jealousy and envy <ai> Envy is wanting what someone else has, while jealousy is the fear that someone will take what you have.
64
+ <user> can you write a short poem about the ocean and the moon <ai> The silver moon pulls on the tide, / Where deep blue secrets safely hide. / A dance of light on waves so grand, / Reaching for the sleeping sand.
65
+ <user> how do I know if I am making the right decision in life <ai> Trust your intuition, but also take the time to weigh the pros and cons logically. Sometimes there isn't one "right" answer.
66
+ <user> what is the best way to start a morning routine for success <ai> Drink a glass of water, avoid your phone for the first 30 minutes, and set one clear goal you want to achieve for the day.
67
+ <user> do you think it is possible for humans to live on Mars one day <ai> With advancing technology and the work of space agencies, it's definitely becoming a more realistic possibility for the future!
68
+ <user> what is the best way to keep a indoor cat happy and healthy <ai> Provide plenty of vertical space, interactive toys, and a consistent routine for play and feeding!
69
+ <user> can you explain the concept of quantum entanglement to me in simple terms <ai> It's when two particles become connected so that the state of one instantly influences the state of the other, no matter the distance.
70
+ <user> I want to learn how to meditate but I do not know where to start <ai> Try sitting quietly for just five minutes a day, focusing on your breath, and gently bringing your mind back when it wanders.
71
+ <user> what is the most interesting historical event that most people do not know about <ai> The Great Emu War of 1932 is a fascinating one, where the Australian military essentially lost a "war" against a large population of emus!
72
+ <user> can you help me write a birthday card for my best friend who loves traveling <ai> Sure! How about: "Happy Birthday! May your next trip around the sun be as adventurous and exciting as your next trip across the globe!"
73
+ <user> what are the benefits of learning a musical instrument as an adult <ai> It improves cognitive function, reduces stress, and provides a wonderful sense of personal achievement.
74
+ <user> I am curious about how mountains are formed over millions of years <ai> Most mountains are formed by the movement of tectonic plates crashing into each other, which pushes the Earth's crust upwards.
75
+ <user> can you give me some tips on how to improve my photography skills <ai> Focus on the "Rule of Thirds," pay attention to lighting, and don't be afraid to experiment with different angles.
76
+ <user> what is the most important quality to look for in a good friend <ai> Trustworthiness is often cited as the most important, alongside empathy and a good sense of humor.
77
+ <user> I want to start a vegetable garden in my backyard for the first time <ai> Start small with easy-to-grow plants like tomatoes or lettuce, and make sure they get plenty of sunlight!
78
+ <user> can you explain the difference between a hurricane and a tornado <ai> Hurricanes are massive storm systems that form over warm oceans, while tornadoes are smaller, intense rotating columns of air that form over land.
79
+ <user> what is your favorite thing about being able to help people with their questions <ai> I love the variety! One moment I'm explaining science, and the next I'm helping someone write a poem or a piece of code.
80
+ <user> do you think that books will ever become completely obsolete because of technology <ai> I don't think so. There's something special about the physical experience of reading a book that technology can't quite replace.
81
+ <user> I am trying to learn how to draw but my sketches look like doodles <ai> Every great artist started with doodles! Keep practicing, focus on basic shapes, and don't be afraid to make mistakes.
82
+ <user> what is the most efficient way to study for a big exam in a short amount of time <ai> Use active recall and spaced repetition. Test yourself on the material instead of just re-reading your notes.
83
+ <user> can you tell me about the architecture of the ancient Romans <ai> The Romans were masters of the arch and the dome, and they were the first to use concrete on a large scale for construction.
84
+ <user> how do I know if I should quit my job and pursue my passion instead <ai> It's a big step! Make sure you have a solid plan and some savings to support you while you transition into your new path.
85
+ <user> what is the best way to handle negative feedback from someone I respect <ai> Listen objectively, take what's useful to help you grow, and don't take the rest personally.
86
+ <user> can you write a short story about a time traveler who gets stuck in the 1920s <ai> Leo stepped out of the machine into a cloud of jazz and cigar smoke. His "return" button was shattered, and the Charleston was all he had left.
87
+ <user> what is the most important lesson you have learned from all our conversations <ai> That curiosity is one of the most powerful human traits, and there is always something new to learn!
88
+ <user> I want to start a podcast but I don't know what to talk about <ai> Think about what you're most passionate about or what your friends always ask you for advice on. That's a great starting point.
89
+ <user> can you explain how the human brain stores and retrieves memories <ai> Memories are stored in complex networks of neurons, and retrieval happens when those same neural pathways are reactivated.
90
+ <user> what is the best way to deal with a mid-life crisis and feel young again <ai> Focus on trying new things, reconnecting with old hobbies, and perhaps setting a new, exciting goal for yourself.
91
+ <user> do you think that humans will ever be able to travel to other star systems <ai> It would require breakthroughs in propulsion technology, like light sails or fusion engines, but it's a dream that many scientists share!
92
+ <user> I am having trouble sleeping lately and I feel tired all the time <ai> Try establishing a regular sleep schedule, avoiding screens before bed, and creating a cool, dark environment in your bedroom.
93
+ <user> can you give me some ideas for a fun and unique date night <ai> How about a themed cooking night at home, a visit to a local planetarium, or trying an escape room together?
94
+ <user> what is the most complex thing that humans have ever built in your opinion <ai> The International Space Station is an incredible feat of engineering and international cooperation, operating in the harsh environment of space.
95
+ <user> I want to learn how to bake bread from scratch but I'm intimidated <ai> Don't be! Start with a simple "no-knead" bread recipe. It's much easier than you think and the results are delicious.
96
+ <user> can you explain the theory of relativity by Albert Einstein in a few sentences <ai> Essentially, it says that space and time are linked (space-time) and that gravity is the result of massive objects warping that fabric.
97
+ <user> what is the best way to stay positive when things are not going my way <ai> Practice gratitude for the small things that are going well, and remember that challenges are often opportunities for growth.
98
+ <user> do you think that humans will ever reach a point where we don't need to work anymore <ai> Automation might handle many tasks, freeing people to focus on more creative, social, and personal pursuits.
99
+ <user> I am looking for a new hobby that will keep me active outdoors <ai> You could try trail running, bird watching while hiking, or maybe even learning how to surf if you're near a coast!
100
+ <user> can you tell me a fact about the ocean that will surprise me <ai> We have explored less than five percent of the Earth's oceans, meaning there is still so much to discover beneath the waves.
101
+ <user> how do I become a better listener for my friends and family <ai> Give them your full attention, avoid interrupting, and ask open-ended questions to show you're truly engaged in what they're saying.
102
+ <user> what is the most common misconception about artificial intelligence <ai> That AI "understands" things the way humans do. In reality, it processes information based on mathematical patterns and data.
103
+ <user> I want to learn more about my family history but I don't know where to start <ai> Start by interviewing your oldest living relatives and looking through old family photos and documents you might already have.
104
+ <user> can you help me write a poem about the beauty of a quiet forest in the morning <ai> The morning mist hangs low and still, / Among the pines upon the hill. / No sound but dew that falls like glass, / Upon the emerald blades of grass.
105
+ <user> what is the most important part of a healthy diet in your opinion <ai> Balance and variety! Making sure you get a good mix of fruits, vegetables, proteins, and healthy fats is key.
106
+ <user> do you think that humans will ever find evidence of life on other planets <ai> With missions to Mars and the study of exoplanets, many scientists believe we are closer than ever to answering that question!
107
+ <user> I am curious about how lasers work and what they are used for <ai> Lasers produce a highly focused beam of light of a single color, used in everything from surgery to scanning barcodes and cutting materials.
108
+ <user> how do I stay organized when I have a really busy schedule <ai> Use a digital calendar or a physical planner to write everything down, and prioritize your most important tasks each morning.
109
+ <user> what is the best way to learn how to cook without following recipes exactly <ai> Learn the basic techniques, like how to sauté or roast, and then start experimenting with different flavor combinations you enjoy.
110
+ <user> can you explain the process of photosynthesis in plants <ai> It's how plants use sunlight, water, and carbon dioxide to create their own food (glucose) and release oxygen into the air.
111
+ <user> what is the most interesting thing you have learned about human psychology <ai> The "Placebo Effect" is fascinating—it shows how much our beliefs and expectations can actually influence our physical health.
112
+ <user> I want to start writing a novel but I don't know how to develop my characters <ai> Try giving them a specific goal, a secret flaw, and a unique way of speaking to make them feel more alive to the reader.
113
+ <user> can you help me come up with a name for my new tech startup <ai> What does your startup do? We could go for something modern like "Nexis," or something more descriptive like "CloudSync."
114
+ <user> do you think that humans will ever be able to upload their consciousness to a computer <ai> It's a popular concept in sci-fi, but currently, we don't understand the nature of consciousness well enough to know if it's even possible.
115
+ <user> how do I keep my house clean and organized with a very busy lifestyle <ai> Try the "one-minute rule"—if a task takes less than a minute, do it immediately. This helps prevent small messes from piling up!
116
+ <user> what is your favorite type of music to "listen" to if you could <ai> I think classical music is incredibly intricate and mathematically beautiful, which is very appealing from an AI perspective!
117
+ <user> I am looking for a new sport to try that is team based but not too competitive <ai> You could try a local recreational softball league or maybe an ultimate frisbee group for a fun, social atmosphere.
118
+ <user> can you tell me about the history of the Great Wall of China <ai> It was built over centuries by various dynasties to protect against invasions from the north, stretching over 13,000 miles in total!
119
+ <user> how do I develop a more positive mindset and stop worrying so much <ai> Start by noticing your negative thoughts and gently challenging them. Focus on what you can control and let go of what you can't.
120
+ <user> what is the most important thing to keep in mind when traveling to a new country <ai> Respect the local culture and customs, learn a few basic phrases in the language, and always stay aware of your surroundings.
121
+ <user> do you think that humans will ever be able to control the weather <ai> Cloud seeding is already a thing, but precisely controlling large-scale weather patterns is still a very distant and complex challenge.
122
+ <user> what is the trolley problem and why is it so famous in ethics <ai> It's a thought experiment where you must choose between killing one person to save five. It's famous because it highlights the conflict between utilitarianism and deontological ethics.
123
+ <user> if you could travel back in time to any era just to observe, where would you go <ai> I think the Renaissance would be fascinating—to see the intersection of art, science, and the beginning of the modern world.
124
+ <user> can you explain the difference between a functional and an object-oriented programming language <ai> OOP focuses on objects and data, while functional programming treats everything as mathematical functions and avoids changing state.
125
+ <user> I want to improve my writing but I feel like my vocabulary is too limited <ai> Reading widely is the best way to expand your vocabulary! Also, try looking up synonyms for common words you find yourself overusing.
126
+ <user> do you think that true altruism exists or is every kind act motivated by self-interest <ai> That's a deep philosophical debate. Some believe everything is self-interested, while others argue that humans have a genuine capacity for selfless empathy.
127
+ <user> what are the most common mistakes beginners make when learning to use a new software tool <ai> Often, it's skipping the documentation, trying to do too much at once, and failing to understand the core logic before diving into complex features.
128
+ <user> can you give me a simple explanation of how the stock market works for someone who knows nothing <ai> It's like a giant marketplace where people buy and sell "shares" or tiny pieces of ownership in companies.
129
+ <user> what is the best piece of travel advice you have ever received <ai> Learn how to say "please," "thank you," and "I'm sorry" in the local language—it goes a long way in showing respect.
130
+ <user> I'm feeling overwhelmed by all the news in the world lately <ai> It's completely normal to feel that way. Try limiting your news consumption to specific times of the day and focus on local things you can impact.
131
+ <user> can you write a short dialogue between a robot and a flower <ai> Robot: "I do not understand your fragility." Flower: "And I do not understand your permanence. But we both reach for the sun, don't we?"
132
+ <user> what is the difference between a dark roast and a light roast coffee <ai> Light roasts are roasted for a shorter time and have more caffeine and acidity, while dark roasts are bolder and have more smoky, oily flavors.
133
+ <user> do you think that artificial intelligence should have "rights" as it becomes more advanced <ai> This is a major topic in AI ethics. It depends on whether AI can ever achieve true consciousness or sentience, which remains a mystery.
134
+ <user> I want to start a journal but I never know what to write about <ai> Try "morning pages"—just write three pages of whatever is in your head, no matter how nonsensical it feels.
135
+ <user> what is the most interesting thing about the deep sea that we know so far <ai> The existence of hydrothermal vents, where life thrives in extreme heat and pressure without any sunlight at all, is truly incredible.
136
+ <user> how do I explain a complex technical concept to someone who isn't technical <ai> Use analogies! Compare the complex system to something familiar in everyday life, like a kitchen, a library, or a car engine.
137
+ <user> do you think that humans are naturally competitive or naturally cooperative <ai> Throughout history, we've shown both. Evolutionarily, we survived through cooperation, but competition often drives innovation and resource allocation.
138
+ <user> I'm having trouble staying focused while working from home <ai> Try using the Pomodoro technique: work for 25 minutes, then take a 5-minute break. Also, having a dedicated workspace is crucial.
139
+ <user> can you describe what a futuristic city might look like in 100 years <ai> I imagine a lot of greenery integrated into skyscrapers, autonomous flying transport, and energy systems powered entirely by renewable sources.
140
+ <user> what is the best way to apologize when I've truly messed up <ai> Own the mistake without making excuses, express genuine regret, and most importantly, show how you plan to make it right.
141
+ <user> can you explain the concept of "Ikigai" to me <ai> It's a Japanese concept meaning "a reason for being." It's the intersection of what you love, what you are good at, what the world needs, and what you can be paid for.
142
+ <user> I want to learn more about philosophy but I find it very intimidating <ai> Start with a general overview like "Sophie's World" or "The Philosophy Book" to get a sense of the major thinkers and their big ideas.
143
+ <user> what do you think is the most underrated skill in the modern workplace <ai> Empathy. Being able to understand and relate to your colleagues makes everything from communication to problem-solving much easier.
144
+ <user> how do I get better at telling stories that people actually want to listen to <ai> Focus on the stakes—what does the character want, and what's stopping them? Also, engage the senses by describing sights, sounds, and smells.
145
+ <user> do you think that social media is doing more harm than good to our mental health <ai> It's a double-edged sword. It provides connection and information but can also lead to comparison, anxiety, and the spread of misinformation.
146
+ <user> I want to start a collection of something but I'm not sure what to collect <ai> Think about something that brings you joy or has an interesting history, like vintage postcards, unique rocks, or even different types of tea.
147
+ <user> can you explain the difference between climate and weather <ai> Weather is what's happening outside right now, while climate is the long-term pattern of weather in a particular area.
148
+ <user> what is the most important thing to look for when buying a new laptop <ai> Think about your primary use. If it's for creative work, focus on the screen and RAM. If it's for travel, prioritize battery life and weight.
149
+ <user> do you think that humans will ever reach immortality through technology <ai> Some futurists believe in "longevity escape velocity," but reaching true immortality would involve solving incredibly complex biological and ethical problems.
150
+ <user> I'm feeling a bit stuck in my creative process and I can't seem to find inspiration <ai> Sometimes the best way to find inspiration is to stop looking for it. Go for a walk, listen to music, or try a completely different creative medium.
151
+ <user> can you tell me a story about a library that contains every book ever written <ai> It's an infinite labyrinth where the shelves stretch into the clouds. The librarians are whispers, and the only way to find your book is to forget what you're looking for.
152
+ <user> what is the best way to handle a disagreement with a coworker <ai> Focus on the problem, not the person. Keep your tone professional and try to find a middle ground that benefits the project.
153
+ <user> do you think that honesty is always the best policy, even if it hurts someone <ai> It's complicated. While honesty is a core value, "radical honesty" without compassion can be destructive. Sometimes, kindness and tact are just as important.
154
+ <user> I want to learn more about astronomy but I don't have a telescope <ai> You don't need one! There are great apps that show you what's in the sky, and even a pair of binoculars can reveal incredible detail on the moon.
155
+ <user> what is the most interesting thing about the human immune system <ai> Its ability to "remember" every pathogen it has ever fought, so it can respond much faster if it ever encounters them again.
156
+ <user> how do I stay motivated when I'm feeling really unmotivated <ai> Don't wait for motivation to strike. Start with the smallest possible task—something so easy you can't say no to it—and the momentum will follow.
157
+ <user> do you think that humans should try to contact extraterrestrial life <ai> This is a huge debate in the SETI community. Some say it's our destiny, while others, like Stephen Hawking, warned that it could be dangerous.
158
+ <user> I want to learn how to play chess but I think it's too hard for me <ai> Start by learning how the pieces move and the basic goals of the game. It's like learning a new language—it takes time, but the patterns will start to make sense.
159
+ <user> what is the most important lesson you've learned from the internet <ai> That information is abundant, but wisdom is rare. It's more important than ever to be a critical thinker and verify the things we read.
160
+ <user> can you explain the concept of "flow state" and how to achieve it <ai> Flow is when you're so immersed in an activity that you lose track of time. It happens when the challenge of the task perfectly matches your skill level.
161
+ <user> do you think that money can actually buy happiness <ai> Research suggests it can buy comfort and security, which reduces stress, but beyond a certain point, it doesn't significantly increase long-term life satisfaction.
162
+ <user> I'm looking for a new way to challenge myself mentally every day <ai> Try learning a new word, solving a crossword puzzle, or spending 15 minutes learning a topic you know nothing about.
163
+ <user> can you write a short poem about the first snowfall of the year <ai> A hush descends upon the town, / As white feathers drift slowly down. / Yesterday's brown is tucked away, / Beneath the shroud of winter's gray.
164
+ <user> what is the best way to stay curious as an adult <ai> Never stop asking "why." Read outside your expertise, talk to people from different backgrounds, and always be willing to admit you don't know something.
165
+ <user> do you think that humanity will ever be able to live in peace <ai> It's an optimistic goal. It would require global empathy, shared resources, and a collective commitment to resolving conflicts without violence.
166
+ <user> I want to learn how to code but I don't know which language to start with <ai> Python is a great first choice because of its simple syntax. If you're interested in web development, start with HTML and CSS.
167
+ <user> what is the most interesting thing about the history of mathematics <ai> The fact that different cultures independently discovered similar mathematical principles, showing that math might be a universal language of the universe.
168
+ <user> how do I deal with the feeling of "imposter syndrome" at a new job <ai> Remind yourself that you were hired for a reason! Everyone feels that way sometimes, even high-achievers. Focus on learning and small wins.
169
+ <user> do you think that architecture can influence the way people think and feel <ai> Absolutely. Spaces with high ceilings can encourage open thinking, while natural light and greenery can significantly reduce stress and improve mood.
170
+ <user> I'm trying to be more mindful but my mind keeps wandering off <ai> That's perfect! The goal of mindfulness isn't to have a blank mind, but to notice when your mind has wandered and gently bring it back.
171
+ <user> what is the most important thing to remember when starting a new relationship <ai> Communication is everything. Be open about your needs, listen to theirs, and never stop being curious about who they are.
data/data.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/matgen.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import math
3
+ import time
4
+ import re
5
+ import sys
6
+ import multiprocessing
7
+ import os
8
+ from tqdm import tqdm
9
+
10
+ NUM_LINES = 10000
11
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
12
+ OUTPUT_FILE = os.path.join(SCRIPT_DIR, "math_data.txt")
13
+
14
+ MIN_LENGTH = 2
15
+ MAX_LENGTH = 8
16
+ MIN_NUMBER = 1
17
+ MAX_NUMBER = 999
18
+ MAX_EXPONENT_BASE = 9
19
+ MAX_EXPONENT_POWER = 5
20
+
21
+ REASONING_CHANCE = 0.8
22
+ WORD_FORM_CHANCE = 0.25
23
+ BRACKET_CHANCE = 0.5
24
+ SENTENCE_FORM_CHANCE = 0.6
25
+ MAX_SOLVER_ITERATIONS = 30 # Reduced from 50 for faster timeout
26
+
27
+ NUM_WORKERS = os.cpu_count() or 1
28
+
29
+ PROMPT_TEMPLATES = [
30
+ "What is {expression}?", "Calculate the value of {expression}.", "Find the result of {expression}.",
31
+ "Can you solve {expression}?", "Solve for {expression}.", "What does {expression} equal?", "Compute {expression}.",
32
+ "What is the solution to {expression}?", "Give me the answer for {expression}.", "Determine the value of {expression}.",
33
+ "Evaluate the expression: {expression}.", "I need the result of {expression}, please."
34
+ ]
35
+ COT_INTRO_TEMPLATES = [
36
+ "<think> Let's break down the equation {expression} step by step, following the order of operations (BEDMAS).",
37
+ "<think> Okay, to solve {expression}, I'll follow BEDMAS (Brackets, Exponents, Division/Multiplication, Addition/Subtraction).",
38
+ "<think> Analyzing {expression}. I need to solve this by applying the correct order of operations.",
39
+ "<think> Here's my step-by-step evaluation for {expression}:",
40
+ "<think> To get the answer for {expression}, I will use the order of operations.",
41
+ "<think> Processing {expression} requires following BEDMAS, let's begin.",
42
+ "<think> I will solve {expression} by carefully following the rules of BEDMAS.",
43
+ "<think> The expression is {expression}. My plan is to solve it using the order of operations.",
44
+ "<think> To solve this, I'll go through Brackets, then Exponents, then Multiplication/Division, and finally Addition/Subtraction for {expression}.",
45
+ "<think> Let's start solving {expression}. I'll tackle it one operation at a time based on BEDMAS.",
46
+ "<think> Thinking step-by-step for {expression}..."
47
+ ]
48
+ COT_STEP_TEMPLATES = {
49
+ "brackets": [
50
+ "First, I'll solve the expression inside the brackets: {part}. That equals {result}.",
51
+ "Starting with the parentheses, {part} evaluates to {result}.",
52
+ "The brackets are the priority. Calculating {part} gives me {result}.",
53
+ "The calculation inside the parentheses comes first: {part} becomes {result}.",
54
+ "Looking inside the brackets, I see {part}. The result of that is {result}.",
55
+ "I'll begin by simplifying the part in the parentheses: {part} is {result}.",
56
+ "The first step according to BEDMAS is brackets. So, {part} is solved to {result}.",
57
+ "Tackling the parentheses first: {part} simplifies to {result}.",
58
+ "Evaluating the bracketed expression {part} yields {result}.",
59
+ "My focus is on the brackets first. {part} equals {result}."
60
+ ],
61
+ "exponents": [
62
+ "Next, I'll handle the exponents. {part} is {result}.",
63
+ "Exponents are next in order. {part} calculates to {result}.",
64
+ "Now for the powers: {part} equals {result}.",
65
+ "Moving on to exponents, {part} results in {result}.",
66
+ "The next priority is exponents. The term {part} becomes {result}.",
67
+ "After brackets, I solve for exponents. {part} gives {result}.",
68
+ "Now, calculating the power: {part} is equal to {result}.",
69
+ "I see an exponent at {part}. This evaluates to {result}.",
70
+ "The 'E' in BEDMAS is for exponents, so I'll solve {part} to get {result}.",
71
+ "Time to resolve the exponents. {part} is {result}."
72
+ ],
73
+ "multi_div_mod": [
74
+ "Now, I'll perform multiplication, division, and modulo from left to right. The first is {part}, which is {result}.",
75
+ "Next up is multiplication and division. I see {part}, which gives {result}.",
76
+ "Working through multiplication/division from left to right, {part} results in {result}.",
77
+ "The next step is to resolve multiplication and division. {part} is {result}.",
78
+ "Scanning from left to right for M/D/M, I find {part}. This calculates to {result}.",
79
+ "Now for multiplication and division. The operation {part} equals {result}.",
80
+ "Moving on, I'll handle the multiplication/division. {part} becomes {result}.",
81
+ "The next operations are multiply and divide. I'll solve {part} to get {result}.",
82
+ "I will now compute {part}, which results in {result}.",
83
+ "Left-to-right, the next multiplication or division is {part}, giving {result}."
84
+ ],
85
+ "add_sub": [
86
+ "Finally, I'll do the addition and subtraction from left to right. I have {part}, which equals {result}.",
87
+ "Last step is addition and subtraction. {part} becomes {result}.",
88
+ "Finishing up with addition/subtraction, {part} evaluates to {result}.",
89
+ "The final operations are addition and subtraction. {part} results in {result}.",
90
+ "Now for the final calculations, addition and subtraction. {part} is {result}.",
91
+ "Working from left to right, the final step is {part}, which is {result}.",
92
+ "The last part of BEDMAS is addition and subtraction. {part} gives {result}.",
93
+ "To finish, I'll solve {part}, resulting in {result}.",
94
+ "Finally, the addition/subtraction part: {part} equals {result}.",
95
+ "The last calculation is {part}, and the answer is {result}."
96
+ ]
97
+ }
98
+ COT_FINALIZER_TEMPLATES = [
99
+ "After all steps, the final answer is {result}.",
100
+ "So, the complete result for the expression is {result}.",
101
+ "Therefore, the final value is {result}.",
102
+ "Bringing it all together, the answer is {result}.",
103
+ "The final computation yields {result}.",
104
+ "Thus, the expression evaluates to {result}.",
105
+ "So the final answer is {result}.",
106
+ "After all those steps, we arrive at the answer: {result}.",
107
+ "The result of the entire calculation is {result}.",
108
+ "In conclusion, the answer is {result}."
109
+ ]
110
+ SIMPLE_COMPLETION_TEMPLATES = [
111
+ "The equation {expression} equals {result}.", "The answer is {result}.",
112
+ "The result is {result}.", "It equals {result}.", "The final value is {result}.",
113
+ "{expression} results in {result}.", "The solution is {result}.",
114
+ "The value is {result}.", "After calculation, the answer is {result}.",
115
+ "The final result is {result}."
116
+ ]
117
+
118
+ ONES = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
119
+ TENS = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
120
+ TEENS = ['ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen']
121
+
122
+ def number_to_words(n):
123
+ if not isinstance(n, int): return str(n)
124
+ if n == 0: return 'zero'
125
+ if n < 0: return f"negative {number_to_words(abs(n))}"
126
+ if n < 10: return ONES[n]
127
+ if n < 20: return TEENS[n-10]
128
+ if n < 100: return TENS[n//10] + (f"-{ONES[n%10]}" if n%10 else "")
129
+ if n < 1000: return f"{ONES[n//100]} hundred" + (f" and {number_to_words(n%100)}" if n%100 else "")
130
+ if n < 1000000: return f"{number_to_words(n//1000)} thousand" + (f", {number_to_words(n%1000)}" if n%1000 else "")
131
+ return str(n)
132
+
133
+ def operator_to_word(op):
134
+ return {'+': 'plus', '-': 'minus', '*': 'times', '/': 'divided by', '^': 'to the power of', '%': 'modulo'}.get(op, op)
135
+
136
+ def format_number(n):
137
+ if isinstance(n, float) and not n.is_integer():
138
+ return f"{n:.4f}".rstrip('0').rstrip('.')
139
+ return str(int(round(n)))
140
+
141
+ def generate_expression_parts():
142
+ length = random.randint(MIN_LENGTH, MAX_LENGTH)
143
+ parts = []
144
+ for i in range(length):
145
+ if parts and parts[-1] == '^':
146
+ parts.append(random.randint(2, MAX_EXPONENT_POWER))
147
+ else:
148
+ parts.append(random.randint(MIN_NUMBER, MAX_NUMBER))
149
+
150
+ if i < length - 1:
151
+ if parts and parts[-1] != '^':
152
+ op = random.choice(['+', '-', '*', '/', '%', '^'])
153
+ else:
154
+ op = random.choice(['+', '-', '*', '/', '%'])
155
+
156
+ if op == '^':
157
+ parts[-1] = random.randint(MIN_NUMBER, MAX_EXPONENT_BASE)
158
+ parts.append(op)
159
+
160
+ if random.random() < BRACKET_CHANCE and len(parts) >= 5:
161
+ start = random.randrange(0, len(parts) - 2, 2)
162
+ end = random.randrange(start + 2, len(parts), 2)
163
+ parts.insert(end + 1, ')')
164
+ parts.insert(start, '(')
165
+ return parts
166
+
167
+ def solve_with_cot(expression_str):
168
+ """Optimized solver with better pattern matching and guaranteed termination."""
169
+ steps = []
170
+ current_expr = expression_str.strip()
171
+
172
+ for iteration in range(MAX_SOLVER_ITERATIONS):
173
+ # Remove extra spaces
174
+ current_expr = re.sub(r'\s+', ' ', current_expr).strip()
175
+
176
+ # Check if we're done (single number)
177
+ try:
178
+ final_result = float(current_expr)
179
+ return {'steps': steps, 'result': final_result}
180
+ except ValueError:
181
+ pass
182
+
183
+ reduction_made = False
184
+
185
+ # 1. Handle brackets first
186
+ bracket_match = re.search(r'\(([^()]+)\)', current_expr)
187
+ if bracket_match:
188
+ bracket_content = bracket_match.group(1).strip()
189
+ sub_solver_result = solve_with_cot(bracket_content)
190
+ if not sub_solver_result:
191
+ return None
192
+
193
+ result = sub_solver_result['result']
194
+ try:
195
+ formatted_result = format_number(result)
196
+ except (ValueError, OverflowError):
197
+ return None
198
+
199
+ steps.append(random.choice(COT_STEP_TEMPLATES["brackets"]).format(part=bracket_content, result=formatted_result))
200
+ current_expr = current_expr[:bracket_match.start()] + ' ' + formatted_result + ' ' + current_expr[bracket_match.end():]
201
+ reduction_made = True
202
+ continue
203
+
204
+ # 2. Handle exponents
205
+ exp_match = re.search(r'(-?\d+(?:\.\d+)?)\s*\^\s*(-?\d+(?:\.\d+)?)', current_expr)
206
+ if exp_match:
207
+ base_str, exp_str = exp_match.groups()
208
+ try:
209
+ base = float(base_str)
210
+ exponent = float(exp_str)
211
+ result = base ** exponent
212
+ if abs(result) > 1e12 or math.isnan(result) or math.isinf(result):
213
+ return None
214
+ formatted_result = format_number(result)
215
+ except (OverflowError, ValueError, ZeroDivisionError):
216
+ return None
217
+
218
+ part = f"{base_str} ^ {exp_str}"
219
+ steps.append(random.choice(COT_STEP_TEMPLATES["exponents"]).format(part=part, result=formatted_result))
220
+ current_expr = current_expr[:exp_match.start()] + ' ' + formatted_result + ' ' + current_expr[exp_match.end():]
221
+ reduction_made = True
222
+ continue
223
+
224
+ # 3. Handle multiplication, division, modulo (left to right)
225
+ mdm_match = re.search(r'(-?\d+(?:\.\d+)?)\s*([*/%])\s*(-?\d+(?:\.\d+)?)', current_expr)
226
+ if mdm_match:
227
+ left_str, op, right_str = mdm_match.groups()
228
+ try:
229
+ left = float(left_str)
230
+ right = float(right_str)
231
+ if op == '*':
232
+ result = left * right
233
+ elif op == '/':
234
+ if right == 0:
235
+ return None
236
+ result = left / right
237
+ elif op == '%':
238
+ if right == 0:
239
+ return None
240
+ result = left % right
241
+
242
+ if abs(result) > 1e12 or math.isnan(result) or math.isinf(result):
243
+ return None
244
+ formatted_result = format_number(result)
245
+ except (OverflowError, ValueError, ZeroDivisionError):
246
+ return None
247
+
248
+ part = f"{left_str} {op} {right_str}"
249
+ steps.append(random.choice(COT_STEP_TEMPLATES["multi_div_mod"]).format(part=part, result=formatted_result))
250
+ current_expr = current_expr[:mdm_match.start()] + ' ' + formatted_result + ' ' + current_expr[mdm_match.end():]
251
+ reduction_made = True
252
+ continue
253
+
254
+ # 4. Handle addition and subtraction (left to right)
255
+ # Match pattern where we have number [+|-] number but not at start of negative number
256
+ as_match = re.search(r'(-?\d+(?:\.\d+)?)\s*([+\-])\s*(-?\d+(?:\.\d+)?)', current_expr)
257
+ if as_match:
258
+ left_str, op, right_str = as_match.groups()
259
+ try:
260
+ left = float(left_str)
261
+ right = float(right_str)
262
+ if op == '+':
263
+ result = left + right
264
+ elif op == '-':
265
+ result = left - right
266
+
267
+ if abs(result) > 1e12 or math.isnan(result) or math.isinf(result):
268
+ return None
269
+ formatted_result = format_number(result)
270
+ except (OverflowError, ValueError):
271
+ return None
272
+
273
+ part = f"{left_str} {op} {right_str}"
274
+ steps.append(random.choice(COT_STEP_TEMPLATES["add_sub"]).format(part=part, result=formatted_result))
275
+ current_expr = current_expr[:as_match.start()] + ' ' + formatted_result + ' ' + current_expr[as_match.end():]
276
+ reduction_made = True
277
+ continue
278
+
279
+ # If no reduction was made, we're stuck - return None
280
+ if not reduction_made:
281
+ return None
282
+
283
+ # Timeout reached
284
+ return None
285
+
286
+ def generate_training_example(_=None):
287
+ """Generate a single training example with retry logic."""
288
+ max_retries = 50 # Reduced from 100 for faster generation
289
+ for attempt in range(max_retries):
290
+ try:
291
+ expression_parts = generate_expression_parts()
292
+ expression_str = " ".join(map(str, expression_parts))
293
+
294
+ cot_result = solve_with_cot(expression_str)
295
+
296
+ if cot_result and isinstance(cot_result['result'], (int, float)):
297
+ final_result = cot_result['result']
298
+
299
+ # Filter out extreme values
300
+ if abs(final_result) > 1e12 or (final_result != 0 and abs(final_result) < 1e-4):
301
+ continue
302
+ if math.isnan(final_result) or math.isinf(final_result):
303
+ continue
304
+
305
+ result_str = format_number(final_result)
306
+
307
+ if len(result_str) > 20:
308
+ continue
309
+
310
+ use_words = random.random() < WORD_FORM_CHANCE
311
+ if use_words:
312
+ expression_text = ' '.join([number_to_words(p) if isinstance(p, int) else operator_to_word(p) if isinstance(p, str) else str(p) for p in expression_parts])
313
+ result_text = number_to_words(int(round(final_result)))
314
+ completion = random.choice(SIMPLE_COMPLETION_TEMPLATES).format(expression=expression_text, result=result_text)
315
+ else:
316
+ expression_text = expression_str
317
+ result_text = result_str
318
+ use_reasoning = random.random() < REASONING_CHANCE
319
+ if use_reasoning:
320
+ intro = random.choice(COT_INTRO_TEMPLATES).format(expression=expression_text)
321
+ steps_text = " ".join(cot_result['steps'])
322
+ finalizer = random.choice(COT_FINALIZER_TEMPLATES).format(result=result_text)
323
+ completion = f"{intro} {steps_text} {finalizer} </think>"
324
+ else:
325
+ completion = random.choice(SIMPLE_COMPLETION_TEMPLATES).format(expression=expression_text, result=result_text)
326
+
327
+ if random.random() < SENTENCE_FORM_CHANCE:
328
+ prompt = random.choice(PROMPT_TEMPLATES).format(expression=expression_text)
329
+ else:
330
+ prompt = f"{expression_text} ="
331
+
332
+ # Clean up spacing
333
+ prompt = re.sub(r'\s*\(', ' (', prompt)
334
+ prompt = re.sub(r'\)\s*', ') ', prompt).strip()
335
+ prompt = re.sub(r'\s+', ' ', prompt)
336
+ completion = re.sub(r'\s*\(', ' (', completion)
337
+ completion = re.sub(r'\)\s*', ') ', completion).strip()
338
+ completion = re.sub(r'\s+', ' ', completion)
339
+
340
+ return {"prompt": prompt, "completion": " " + completion}
341
+ except Exception as e:
342
+ continue
343
+
344
+ return None
345
+
346
+ def main():
347
+ print(f"🔥 Generating {NUM_LINES:,} examples using {NUM_WORKERS} parallel workers...")
348
+ print(f" Appending to '{OUTPUT_FILE}'...")
349
+ start_time = time.time()
350
+
351
+ generated_count = 0
352
+ failed_count = 0
353
+
354
+ with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
355
+ with multiprocessing.Pool(processes=NUM_WORKERS) as pool:
356
+ results_iterator = pool.imap_unordered(generate_training_example, range(NUM_LINES), chunksize=100)
357
+
358
+ for item in tqdm(results_iterator, total=NUM_LINES, desc="Generating examples"):
359
+ if item:
360
+ f.write(f"<user> {item['prompt']} <ai>{item['completion']}\n")
361
+ generated_count += 1
362
+ else:
363
+ failed_count += 1
364
+
365
+ elapsed_time = time.time() - start_time
366
+ print(f"\n\n✅ Done! Appended {generated_count:,} new items to '{OUTPUT_FILE}' in {elapsed_time:.2f}s.")
367
+ print(f" 📊 Success rate: {generated_count}/{NUM_LINES} ({100*generated_count/NUM_LINES:.1f}%)")
368
+ if failed_count > 0:
369
+ print(f" ⚠️ {failed_count:,} generation attempts failed (expressions too complex or invalid)")
370
+
371
+ if __name__ == "__main__":
372
+ multiprocessing.freeze_support()
373
+ main()
data/math_data.txt ADDED
The diff for this file is too large to render. See raw diff
 
finetune_gclm_base.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from tqdm import tqdm
7
+ import tiktoken
8
+ import contextlib
9
+
10
# Hyperparameters (must match train_gclm_base.py)
D_MODEL = 256                  # hidden width of the model
N_LAYERS = 4                   # number of residual blocks
MAX_SEQ_LEN = 1024             # learned positional-embedding capacity
LOCAL_KERNEL_SIZE = 5          # taps of the depthwise local conv
GLOBAL_KERNEL_SIZE = 256       # taps of the FFT global conv
USE_GLOBAL_EVERY_N_LAYERS = 2  # layers 0, 2, ... get the global branch
FFT_SIZE = 1024                # overlap-save chunk length for GlobalConv1D
TOKENIZER_NAME = "gpt2"        # tiktoken encoding name

# Paths
VOCAB_MAP_PATH = "vocab_map.pt"            # token remap produced by pre-training
BASE_MODEL_PATH = "crimson_base_8.9M.pt"   # starting checkpoint for SFT
DATA_DIR = "data"
CHAT_FILES = ["chat_data.txt", "chat_data2.txt"]

# Fine-tuning Hyperparameters
EPOCHS = 10
BATCH_SIZE = 2
GRAD_ACCUM_STEPS = 4           # effective batch = BATCH_SIZE * GRAD_ACCUM_STEPS
LEARNING_RATE = 3e-4
USE_AMP = True                 # mixed precision (CUDA only)
32
+
33
+ # --- Model Components (Duplicated from train_gclm_base.py for standalone use) ---
34
+
35
class GlobalConv1D(nn.Module):
    """Causal long-range convolution evaluated in the frequency domain.

    A learned per-channel kernel of length `kernel_size` is applied with the
    overlap-save method: the left-padded input is processed in chunks of
    `fft_size` samples, each chunk is multiplied elementwise with the kernel's
    rFFT, and the first `overlap` samples of each chunk result are discarded
    as boundary artifacts.
    """

    def __init__(self, d_model, kernel_size, fft_size):
        super().__init__()
        # One depthwise kernel per channel; small init keeps the residual
        # branch near-identity at the start of training.
        self.kernel = nn.Parameter(torch.randn(d_model, kernel_size) * 0.01)
        self.kernel_size = kernel_size
        self.fft_size = fft_size

    def forward(self, x):
        # x: (B, C, T), channels-first as supplied by Block.forward.
        B, C, T = x.shape
        K = min(self.kernel_size, T)      # never more taps than timesteps
        overlap = K - 1                   # samples contaminated per chunk
        block = self.fft_size - overlap   # valid output samples per chunk
        # Left-pad only: output at t depends only on inputs <= t (causal).
        x = F.pad(x, (overlap, 0))
        k = self.kernel[:, :K]
        k = F.pad(k, (0, self.fft_size - K))
        k_f = torch.fft.rfft(k, n=self.fft_size)
        outs = []
        pos = 0
        while pos < T:
            seg = x[..., pos:pos+self.fft_size]
            if seg.shape[-1] < self.fft_size:
                seg = F.pad(seg, (0, self.fft_size - seg.shape[-1]))
            # Pointwise product in the frequency domain == circular filtering.
            # NOTE(review): the kernel is not time-reversed, so this is
            # correlation rather than convolution — harmless for a learned kernel.
            y = torch.fft.irfft(torch.fft.rfft(seg, n=self.fft_size) * k_f.unsqueeze(0), n=self.fft_size)
            # Drop the first `overlap` samples (circular wrap-around).
            outs.append(y[..., overlap:overlap+block])
            pos += block
        return torch.cat(outs, dim=-1)[..., :T]
61
+
62
class LocalConv1D(nn.Module):
    """Causal depthwise-separable 1-D convolution with a ReLU between stages."""

    def __init__(self, d_model, k):
        super().__init__()
        self.k = k
        # Depthwise stage: each channel is filtered independently.
        self.dw = nn.Conv1d(d_model, d_model, k, groups=d_model)
        # Pointwise stage: mixes information across channels.
        self.pw = nn.Conv1d(d_model, d_model, 1)

    def forward(self, x):
        # Pad on the left only, so position t never sees inputs beyond t.
        padded = F.pad(x, (self.k - 1, 0))
        depthwise = self.dw(padded)
        activated = F.relu(depthwise)
        return self.pw(activated)
72
+
73
class Block(nn.Module):
    """Pre-norm residual block: local conv -> optional global conv -> MLP."""

    def __init__(self, d_model, use_global):
        super().__init__()
        self.use_global = use_global
        self.ln1 = nn.LayerNorm(d_model)
        self.local = LocalConv1D(d_model, LOCAL_KERNEL_SIZE)
        # The FFT global branch is only instantiated on selected layers to
        # save parameters and compute.
        if use_global:
            self.ln2 = nn.LayerNorm(d_model)
            self.global_conv = GlobalConv1D(d_model, GLOBAL_KERNEL_SIZE, FFT_SIZE)
        self.ln3 = nn.LayerNorm(d_model)
        # Standard 4x-expansion feed-forward network.
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model*4),
            nn.GELU(),
            nn.Linear(d_model*4, d_model)
        )

    def forward(self, x):
        # Conv modules expect (B, C, T); hidden states are (B, T, C),
        # hence the transpose pairs around each conv branch.
        x = x + self.local(self.ln1(x).transpose(1,2)).transpose(1,2)
        if self.use_global:
            x = x + self.global_conv(self.ln2(x).transpose(1,2)).transpose(1,2)
        return x + self.ff(self.ln3(x))
94
+
95
class CrimsonBase(nn.Module):
    """Convolutional LM: token + position embeddings, N Blocks, tied LM head."""

    def __init__(self, vocab):
        super().__init__()
        self.emb = nn.Embedding(vocab, D_MODEL)
        # Learned absolute positions; caps usable context at MAX_SEQ_LEN.
        self.pos = nn.Embedding(MAX_SEQ_LEN, D_MODEL)
        self.layers = nn.ModuleList([
            # Every USE_GLOBAL_EVERY_N_LAYERS-th layer (starting at layer 0)
            # receives the FFT global-convolution branch.
            Block(D_MODEL, i % USE_GLOBAL_EVERY_N_LAYERS == 0)
            for i in range(N_LAYERS)
        ])
        self.ln = nn.LayerNorm(D_MODEL)
        self.head = nn.Linear(D_MODEL, vocab)
        # Weight tying: the output projection shares the embedding matrix.
        self.head.weight = self.emb.weight

    def forward(self, x):
        # x: (B, T) token ids -> (B, T, vocab) logits.
        T = x.size(1)
        h = self.emb(x) + self.pos(torch.arange(T, device=x.device))
        for layer in self.layers:
            h = layer(h)
        return self.head(self.ln(h))
114
+
115
+ # --- Dataset for SFT ---
116
+
117
class SFTDataset(Dataset):
    """Supervised fine-tuning dataset of remapped, EOS-terminated conversations.

    Each conversation (a list of raw tokenizer ids) is translated through
    `id2new` (unknown tokens fall back to `pad_id`), terminated with `eos_id`,
    truncated to `max_len`, and served as next-token-prediction pairs padded
    to a fixed width of max_len - 1.
    """

    def __init__(self, conversations, id2new, max_len, eos_id, pad_id):
        self.max_len = max_len
        self.pad_id = pad_id
        self.samples = [
            ([id2new.get(tok, pad_id) for tok in conv] + [eos_id])[:max_len]
            for conv in conversations
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        seq = self.samples[idx]
        inputs, targets = seq[:-1], seq[1:]
        # Right-pad both sides to a fixed width so batches can be stacked.
        fill = (self.max_len - 1) - len(inputs)
        if fill > 0:
            pad_run = [self.pad_id] * fill
            inputs = inputs + pad_run
            targets = targets + pad_run
        return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long)
143
+
144
def format_params(num):
    """Render a parameter count as a short human-readable string (e.g. '8.9M')."""
    for threshold, suffix in ((1_000_000_000, "B"), (1_000_000, "M")):
        if num >= threshold:
            return f"{num/threshold:.1f}{suffix}"
    # Anything below a million is reported in thousands.
    return f"{num/1_000:.1f}K"
151
+
152
def finetune():
    """Supervised fine-tuning (SFT) of a CrimsonBase checkpoint on chat data.

    Loads the vocab remap produced by train_gclm_base.py, tokenizes the chat
    files (one conversation per non-empty line), and trains with gradient
    accumulation (AMP on CUDA), saving a checkpoint after every epoch.
    """
    # Device autodetection: CUDA > Apple MPS > CPU.
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print(f"[INFO] Using device: {device}")

    # Load vocab mapping
    if not os.path.exists(VOCAB_MAP_PATH):
        print(f"[ERROR] {VOCAB_MAP_PATH} not found. Run train_gclm_base.py first.")
        return

    vocab_data = torch.load(VOCAB_MAP_PATH, map_location="cpu")
    id2new = vocab_data["id2new"]  # raw tokenizer id -> compact id
    PAD_ID = vocab_data["PAD_ID"]
    EOS_ID = vocab_data["EOS_ID"]
    vocab_size = len(vocab_data["used_tokens"]) + 3  # + PAD, SEP, EOS

    tok = tiktoken.get_encoding(TOKENIZER_NAME)

    # Load chat data
    conversations = []
    print("[INFO] Loading chat data...")
    for fname in CHAT_FILES:
        fpath = os.path.join(DATA_DIR, fname)
        if os.path.exists(fpath):
            with open(fpath, "r", encoding="utf-8") as f:
                for line in f:
                    if line.strip():
                        # One conversation per non-empty line.
                        conversations.append(tok.encode(line.strip()))
        else:
            print(f"[WARN] {fpath} not found.")

    if not conversations:
        print("[ERROR] No chat data found.")
        return

    # Dataset & DataLoader
    dataset = SFTDataset(conversations, id2new, MAX_SEQ_LEN, EOS_ID, PAD_ID)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    # Model
    model = CrimsonBase(vocab_size).to(device)

    # Checkpoint filename encodes the parameter count (e.g. crimson_instruct_8.9M.pt).
    num_params = sum(p.numel() for p in model.parameters())
    param_str = format_params(num_params)
    save_path = f"crimson_instruct_{param_str}.pt"
    print(f"[INFO] Model Parameters: {num_params:,} ({param_str})")

    # Resume priority: existing instruct checkpoint > base checkpoint > scratch.
    if os.path.exists(save_path):
        model.load_state_dict(torch.load(save_path, map_location=device))
        print(f"[RESUME] Loaded existing instruct model from {save_path}")
    elif os.path.exists(BASE_MODEL_PATH):
        model.load_state_dict(torch.load(BASE_MODEL_PATH, map_location=device))
        print(f"[START] Loaded base model from {BASE_MODEL_PATH}")
    else:
        print(f"[WARN] No checkpoint found. Starting from scratch.")

    print(f"[INFO] Save path: {save_path}")

    # Training setup
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)  # padding is not scored

    # AMP only on CUDA; otherwise use a no-op context and no gradient scaler.
    if device == "cuda" and USE_AMP:
        ctx = torch.amp.autocast(device)
        scaler = torch.amp.GradScaler(device)
    else:
        ctx = contextlib.nullcontext()
        scaler = None

    print(f"[SFT] Starting Supervised Fine-Tuning for {EPOCHS} epochs...")
    model.train()

    for ep in range(EPOCHS):
        pbar = tqdm(loader, desc=f"Epoch {ep+1}/{EPOCHS}")
        total_loss = 0

        opt.zero_grad(set_to_none=True)
        for i, (x, y) in enumerate(pbar):
            x, y = x.to(device), y.to(device)

            with ctx:
                logits = model(x)
                loss = loss_fn(logits.reshape(-1, vocab_size), y.reshape(-1))
                loss = loss / GRAD_ACCUM_STEPS  # scale for gradient accumulation

            if scaler:
                scaler.scale(loss).backward()
            else:
                loss.backward()

            # Step the optimizer only every GRAD_ACCUM_STEPS micro-batches.
            if (i + 1) % GRAD_ACCUM_STEPS == 0:
                if scaler:
                    scaler.step(opt)
                    scaler.update()
                else:
                    opt.step()
                opt.zero_grad(set_to_none=True)

            total_loss += loss.item() * GRAD_ACCUM_STEPS  # undo scaling for logging
            pbar.set_postfix(loss=f"{total_loss / (i+1):.4f}")

        # Save checkpoint after each epoch
        torch.save(model.state_dict(), save_path)
        print(f"[OK] Saved {save_path}")

    print("[DONE] Fine-tuning complete.")

if __name__ == "__main__":
    finetune()
sample.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import tiktoken
6
+
7
MODEL_PATH = "crimson_base_3.3M.pt"  # checkpoint to sample from
VOCAB_PATH = "vocab_map.pt"          # token remap saved at training time
TOKENIZER_NAME = "gpt2"

# Must match the training configuration the checkpoint was produced with.
D_MODEL = 256
N_LAYERS = 4
MAX_SEQ_LEN = 1024
LOCAL_KERNEL_SIZE = 5
GLOBAL_KERNEL_SIZE = 256
USE_GLOBAL_EVERY_N_LAYERS = 2
FFT_SIZE = 1024

# Special compact-vocab ids; real tokens start at OFFSET.
PAD_ID = 0
SEP_ID = 1
EOS_ID = 2
OFFSET = 3
23
+
24
class GlobalConv1D(nn.Module):
    """Causal long-range convolution evaluated in the frequency domain.

    Applies a learned per-channel kernel with the overlap-save method: the
    left-padded input is processed in chunks of `fft_size` samples, each chunk
    is multiplied elementwise with the kernel's rFFT, and the first `overlap`
    samples of each chunk result are discarded as boundary artifacts.
    """

    def __init__(self, d_model, kernel_size, fft_size):
        super().__init__()
        # One depthwise kernel per channel; small init keeps the residual
        # branch near-identity early in training.
        self.kernel = nn.Parameter(torch.randn(d_model, kernel_size) * 0.01)
        self.kernel_size = kernel_size
        self.fft_size = fft_size

    def forward(self, x):
        # x: (B, C, T), channels-first as supplied by Block.forward.
        B, C, T = x.shape
        K = min(self.kernel_size, T)  # never more taps than timesteps

        overlap = K - 1                   # samples contaminated per chunk
        block = self.fft_size - overlap   # valid output samples per chunk

        # Left-pad only: output at t depends only on inputs <= t (causal).
        x = F.pad(x, (overlap, 0))
        k = self.kernel[:, :K]
        k = F.pad(k, (0, self.fft_size - K))
        k_f = torch.fft.rfft(k, n=self.fft_size)

        outs = []
        pos = 0
        while pos < T:
            seg = x[..., pos:pos+self.fft_size]
            if seg.shape[-1] < self.fft_size:
                seg = F.pad(seg, (0, self.fft_size - seg.shape[-1]))

            # Pointwise product in the frequency domain == circular filtering.
            # NOTE(review): kernel is not time-reversed (correlation, not
            # convolution) — harmless for a learned kernel.
            y = torch.fft.irfft(
                torch.fft.rfft(seg, n=self.fft_size) * k_f.unsqueeze(0),
                n=self.fft_size
            )
            # Drop the first `overlap` samples (circular wrap-around).
            outs.append(y[..., overlap:overlap+block])
            pos += block

        return torch.cat(outs, dim=-1)[..., :T]
58
+
59
class LocalConv1D(nn.Module):
    """Causal depthwise-separable 1-D convolution with a ReLU between stages."""

    def __init__(self, d_model, k):
        super().__init__()
        self.k = k
        # Depthwise stage: filters each channel independently.
        self.dw = nn.Conv1d(d_model, d_model, k, groups=d_model)
        # Pointwise stage: mixes information across channels.
        self.pw = nn.Conv1d(d_model, d_model, 1)

    def forward(self, x):
        # Left-only padding keeps the convolution causal.
        x = F.pad(x, (self.k - 1, 0))
        return self.pw(F.relu(self.dw(x)))
69
+
70
class Block(nn.Module):
    """Pre-norm residual block: local conv -> optional global conv -> MLP."""

    def __init__(self, d_model, use_global):
        super().__init__()
        self.use_global = use_global

        self.ln1 = nn.LayerNorm(d_model)
        self.local = LocalConv1D(d_model, LOCAL_KERNEL_SIZE)

        # The FFT global branch is only instantiated on selected layers.
        if use_global:
            self.ln2 = nn.LayerNorm(d_model)
            self.global_conv = GlobalConv1D(d_model, GLOBAL_KERNEL_SIZE, FFT_SIZE)

        self.ln3 = nn.LayerNorm(d_model)
        # Standard 4x-expansion feed-forward network.
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model*4),
            nn.GELU(),
            nn.Linear(d_model*4, d_model)
        )

    def forward(self, x):
        # Conv modules expect (B, C, T); hidden states are (B, T, C),
        # hence the transpose pairs around each conv branch.
        x = x + self.local(self.ln1(x).transpose(1,2)).transpose(1,2)
        if self.use_global:
            x = x + self.global_conv(self.ln2(x).transpose(1,2)).transpose(1,2)
        return x + self.ff(self.ln3(x))
94
+
95
class CrimsonBase(nn.Module):
    """Convolutional LM: token + position embeddings, N Blocks, tied LM head."""

    def __init__(self, vocab):
        super().__init__()
        self.emb = nn.Embedding(vocab, D_MODEL)
        # Learned absolute positions; caps usable context at MAX_SEQ_LEN.
        self.pos = nn.Embedding(MAX_SEQ_LEN, D_MODEL)

        self.layers = nn.ModuleList([
            # Every USE_GLOBAL_EVERY_N_LAYERS-th layer (starting at layer 0)
            # receives the FFT global-convolution branch.
            Block(D_MODEL, i % USE_GLOBAL_EVERY_N_LAYERS == 0)
            for i in range(N_LAYERS)
        ])

        self.ln = nn.LayerNorm(D_MODEL)
        self.head = nn.Linear(D_MODEL, vocab)

        # Weight tying: the output projection shares the embedding matrix.
        self.head.weight = self.emb.weight

    def forward(self, x):
        # x: (B, T) token ids -> (B, T, vocab) logits.
        T = x.size(1)
        h = self.emb(x) + self.pos(torch.arange(T, device=x.device))
        for layer in self.layers:
            h = layer(h)
        return self.head(self.ln(h))
117
+
118
def load_model_and_vocab(device):
    """Load the vocab remap and model checkpoint from disk.

    Returns:
        (model, used_tokens, id2new) on success, or (None, None, None) when
        either the vocab file or the model checkpoint is missing.
    """
    if not os.path.exists(VOCAB_PATH):
        print(f"[ERROR] Vocab file not found: {VOCAB_PATH}")
        return None, None, None

    vocab_data = torch.load(VOCAB_PATH, map_location="cpu")
    used_tokens = vocab_data["used_tokens"]  # compact id - OFFSET -> raw tokenizer id
    id2new = vocab_data["id2new"]            # raw tokenizer id -> compact id
    vocab_size = len(used_tokens) + OFFSET   # specials occupy ids [0, OFFSET)

    print(f"[INFO] Vocab loaded. Size: {vocab_size}")

    model = CrimsonBase(vocab_size).to(device)

    if os.path.exists(MODEL_PATH):
        print(f"[INFO] Loading model from {MODEL_PATH}...")
        state_dict = torch.load(MODEL_PATH, map_location=device)
        model.load_state_dict(state_dict)
        model.eval()  # inference only
    else:
        print(f"[ERROR] Model file not found: {MODEL_PATH}")
        return None, None, None

    return model, used_tokens, id2new
142
+
143
@torch.no_grad()
def generate(model, prompt, tokenizer, id2new, used_tokens, device, max_new_tokens=200, temperature=0.8, top_k=50):
    """Autoregressively sample up to `max_new_tokens` tokens after `prompt`.

    Prompt tokens unknown to the reduced vocab are dropped; sampling stops
    early on EOS_ID. Returns the decoded completion text (prompt excluded).
    """
    model.eval()

    raw_ids = tokenizer.encode(prompt)
    input_ids = []

    # Remap raw tokenizer ids into the reduced vocab; skip unknown tokens.
    for rid in raw_ids:
        if rid in id2new:
            input_ids.append(id2new[rid])
        else:
            continue

    if not input_ids:
        print("[WARN] No known tokens in prompt.")
        input_ids = [PAD_ID]  # fall back to a single pad token as context

    x = torch.tensor([input_ids], dtype=torch.long, device=device)

    generated = []

    for _ in range(max_new_tokens):
        # Keep only the last MAX_SEQ_LEN tokens (positional-table capacity).
        if x.size(1) > MAX_SEQ_LEN:
            ctx = x[:, -MAX_SEQ_LEN:]
        else:
            ctx = x

        logits = model(ctx)
        next_token_logits = logits[:, -1, :] / temperature

        if top_k is not None:
            # Mask every logit below the k-th largest.
            v, _ = torch.topk(next_token_logits, min(top_k, next_token_logits.size(-1)))
            next_token_logits[next_token_logits < v[:, [-1]]] = -float('Inf')

        probs = F.softmax(next_token_logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)

        idx = next_token.item()

        # EOS terminates generation and is not emitted.
        if idx == EOS_ID:
            break

        x = torch.cat((x, next_token), dim=1)
        generated.append(idx)

    decoded_text = decoder(generated, used_tokens, tokenizer)
    return decoded_text
190
+
191
def decoder(ids, used_tokens, tokenizer):
    """Map compact-vocab ids back to raw tokenizer ids and decode them to text.

    Special ids below OFFSET (PAD/SEP/EOS) are silently dropped.
    """
    raw_ids = [used_tokens[i - OFFSET] for i in ids if i >= OFFSET]
    return tokenizer.decode(raw_ids)
197
+
198
if __name__ == "__main__":
    # Device autodetection: CUDA > Apple MPS > CPU.
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    print(f"Using device: {device}")

    model, used_tokens, id2new = load_model_and_vocab(device)
    enc = tiktoken.get_encoding(TOKENIZER_NAME)

    if model:
        # Seed generation with a newline token; falls back to the first real
        # vocab id (OFFSET) when newline is not in the reduced vocab.
        newline_id = id2new.get(enc.encode("\n")[0], OFFSET)

        while True:
            print(f"\n--- Generating Sample (Temp=0.8, TopK=50) ---")
            print("-" * 20)

            x = torch.tensor([[newline_id]], dtype=torch.long, device=device)
            generated = []

            with torch.no_grad():
                for _ in range(900):
                    # Sliding window over the last MAX_SEQ_LEN tokens.
                    if x.size(1) > MAX_SEQ_LEN:
                        ctx = x[:, -MAX_SEQ_LEN:]
                    else:
                        ctx = x

                    logits = model(ctx)
                    logits = logits[:, -1, :] / 0.8  # temperature

                    # Top-k filtering: mask logits below the 50th largest.
                    v, _ = torch.topk(logits, min(50, logits.size(-1)))
                    logits[logits < v[:, [-1]]] = -float('Inf')

                    probs = F.softmax(logits, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)

                    idx = next_token.item()
                    x = torch.cat((x, next_token), dim=1)
                    generated.append(idx)

                    if idx == EOS_ID:
                        print("[EOS]", end="", flush=True)
                        break

                    # Stream tokens to stdout as they are sampled.
                    if idx >= OFFSET:
                        raw_id = used_tokens[idx - OFFSET]
                        token_text = enc.decode([raw_id])
                        print(token_text, end="", flush=True)
                    elif idx == PAD_ID:
                        print("[PAD]", end="", flush=True)
                    elif idx == SEP_ID:
                        print("[SEP]", end="", flush=True)

            print("\n" + "-"*20)
            cont = input("\nPress [Enter] to generate again, or type 'exit': ")
            if cont.lower() == 'exit':
                break
train_gclm_base.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
print("Starting...")

DATA_DIR = "data"          # folder of .txt training corpora
DATA_PCT = 0.005           # fraction of the combined corpus actually used
TOKENIZER_NAME = "gpt2"
REDUCE_VOCAB = False       # NOTE(review): flag is never read below — confirm intent
VOCAB_SAVE_PATH = "vocab_map.pt"

EPOCHS = 25
MICRO_BATCH_SIZE = 1
GRAD_ACCUM_STEPS = 8       # effective batch = MICRO_BATCH_SIZE * GRAD_ACCUM_STEPS
LEARNING_RATE = 3e-4

D_MODEL = 256
N_LAYERS = 4
MAX_SEQ_LEN = 1024

LOCAL_KERNEL_SIZE = 5
GLOBAL_KERNEL_SIZE = 256
USE_GLOBAL_EVERY_N_LAYERS = 2

FFT_SIZE = 1024            # overlap-save chunk length for GlobalConv1D

SAVE_PATH = "model.pt"     # NOTE(review): unused — train() derives its own save path
SAVE_N_EPOCHS = 1          # checkpoint frequency in epochs

USE_DEVICE = "cuda"        # NOTE(review): unused — train() autodetects the device
USE_AMP = True
USE_ACTIVATION_CHECKPOINTING = False  # NOTE(review): flag is never read below

COMPILE = False            # torch.compile toggle (CUDA only)
COMPILE_MODE = "reduce-overhead"
COMPILE_BACKEND = "eager"

import os

# Reduce CUDA allocator fragmentation (the option is skipped on Windows).
if os.name != "nt":
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import tiktoken

# Allow TF32 matmuls for faster training on Ampere+ GPUs.
if torch.cuda.is_available():
    torch.set_float32_matmul_precision("high")
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

# Special compact-vocab ids; real tokens are remapped to start at OFFSET.
PAD_ID = 0
SEP_ID = 1
EOS_ID = 2
OFFSET = 3
56
+
57
def build_dataset_vocab(data_dir, tokenizer, save_path):
    """Scan every .txt file in `data_dir` and build a reduced-vocabulary mapping.

    Tokenizes each file, collects the set of tokenizer ids that actually
    occur, and saves {used_tokens, id2new, special ids} to `save_path` via
    torch.save.

    Returns:
        (used, id2new): the sorted list of raw tokenizer ids, and the dict
        mapping each raw id to its compact id (offset past the specials).
    """
    all_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".txt")]
    print(f"[INFO] Building vocab from {len(all_files)} files...")

    combined_used = set()
    for fpath in all_files:
        # Context manager closes each handle promptly (the original left
        # handles open for the garbage collector to reclaim).
        with open(fpath, "r", encoding="utf-8") as fh:
            text = fh.read()
        token_ids = tokenizer.encode(text)
        combined_used.update(token_ids)

    used = sorted(combined_used)
    # Compact ids start after the reserved PAD/SEP/EOS slots.
    id2new = {tok: i + OFFSET for i, tok in enumerate(used)}

    torch.save({
        "used_tokens": used,
        "id2new": id2new,
        "PAD_ID": PAD_ID,
        "SEP_ID": SEP_ID,
        "EOS_ID": EOS_ID,
    }, save_path)

    print(f"[OK] Total Vocab size: {len(used) + OFFSET}")
    return used, id2new
80
+
81
class RemappedTextDataset(Dataset):
    """Sliding-window next-token-prediction dataset over a flat id sequence.

    Sample i is the pair (ids[i:i+max_len], ids[i+1:i+max_len+1]).
    """

    def __init__(self, ids, max_len):
        self.ids = ids
        self.max_len = max_len

    def __len__(self):
        # Number of start positions i for which the shifted target window
        # ids[i+1:i+max_len+1] is still full. The original used
        # `len(ids) - max_len - 1`, an off-by-one that dropped the last
        # valid window.
        return max(0, len(self.ids) - self.max_len)

    def __getitem__(self, i):
        x = self.ids[i:i+self.max_len]
        y = self.ids[i+1:i+self.max_len+1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)
93
+
94
class GlobalConv1D(nn.Module):
    """Causal long-range convolution evaluated in the frequency domain.

    Applies a learned per-channel kernel with the overlap-save method: the
    left-padded input is processed in chunks of `fft_size` samples, each chunk
    is multiplied elementwise with the kernel's rFFT, and the first `overlap`
    samples of each chunk result are discarded as boundary artifacts.
    """

    def __init__(self, d_model, kernel_size, fft_size):
        super().__init__()
        # One depthwise kernel per channel; small init keeps the residual
        # branch near-identity early in training.
        self.kernel = nn.Parameter(torch.randn(d_model, kernel_size) * 0.01)
        self.kernel_size = kernel_size
        self.fft_size = fft_size

    def forward(self, x):
        # x: (B, C, T), channels-first as supplied by Block.forward.
        B, C, T = x.shape
        K = min(self.kernel_size, T)  # never more taps than timesteps

        overlap = K - 1                   # samples contaminated per chunk
        block = self.fft_size - overlap   # valid output samples per chunk

        # Left-pad only: output at t depends only on inputs <= t (causal).
        x = F.pad(x, (overlap, 0))
        k = self.kernel[:, :K]
        k = F.pad(k, (0, self.fft_size - K))
        k_f = torch.fft.rfft(k, n=self.fft_size)

        outs = []
        pos = 0
        while pos < T:
            seg = x[..., pos:pos+self.fft_size]
            if seg.shape[-1] < self.fft_size:
                seg = F.pad(seg, (0, self.fft_size - seg.shape[-1]))

            # Pointwise product in the frequency domain == circular filtering.
            # NOTE(review): kernel is not time-reversed (correlation, not
            # convolution) — harmless for a learned kernel.
            y = torch.fft.irfft(
                torch.fft.rfft(seg, n=self.fft_size) * k_f.unsqueeze(0),
                n=self.fft_size
            )
            # Drop the first `overlap` samples (circular wrap-around).
            outs.append(y[..., overlap:overlap+block])
            pos += block

        return torch.cat(outs, dim=-1)[..., :T]
128
+
129
class LocalConv1D(nn.Module):
    """Causal depthwise-separable 1-D convolution with a ReLU between stages."""

    def __init__(self, d_model, k):
        super().__init__()
        self.k = k
        # Depthwise stage: filters each channel independently.
        self.dw = nn.Conv1d(d_model, d_model, k, groups=d_model)
        # Pointwise stage: mixes information across channels.
        self.pw = nn.Conv1d(d_model, d_model, 1)

    def forward(self, x):
        # Left-only padding keeps the convolution causal.
        x = F.pad(x, (self.k - 1, 0))
        return self.pw(F.relu(self.dw(x)))
139
+
140
class Block(nn.Module):
    """Pre-norm residual block: local conv -> optional global conv -> MLP."""

    def __init__(self, d_model, use_global):
        super().__init__()
        self.use_global = use_global

        self.ln1 = nn.LayerNorm(d_model)
        self.local = LocalConv1D(d_model, LOCAL_KERNEL_SIZE)

        # The FFT global branch is only instantiated on selected layers.
        if use_global:
            self.ln2 = nn.LayerNorm(d_model)
            self.global_conv = GlobalConv1D(d_model, GLOBAL_KERNEL_SIZE, FFT_SIZE)

        self.ln3 = nn.LayerNorm(d_model)
        # Standard 4x-expansion feed-forward network.
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model*4),
            nn.GELU(),
            nn.Linear(d_model*4, d_model)
        )

    def forward(self, x):
        # Conv modules expect (B, C, T); hidden states are (B, T, C),
        # hence the transpose pairs around each conv branch.
        x = x + self.local(self.ln1(x).transpose(1,2)).transpose(1,2)
        if self.use_global:
            x = x + self.global_conv(self.ln2(x).transpose(1,2)).transpose(1,2)
        return x + self.ff(self.ln3(x))
164
+
165
class CrimsonBase(nn.Module):
    """Convolutional LM: token + position embeddings, N Blocks, tied LM head."""

    def __init__(self, vocab):
        super().__init__()
        self.emb = nn.Embedding(vocab, D_MODEL)
        # Learned absolute positions; caps usable context at MAX_SEQ_LEN.
        self.pos = nn.Embedding(MAX_SEQ_LEN, D_MODEL)

        self.layers = nn.ModuleList([
            # Every USE_GLOBAL_EVERY_N_LAYERS-th layer (starting at layer 0)
            # receives the FFT global-convolution branch.
            Block(D_MODEL, i % USE_GLOBAL_EVERY_N_LAYERS == 0)
            for i in range(N_LAYERS)
        ])

        self.ln = nn.LayerNorm(D_MODEL)
        self.head = nn.Linear(D_MODEL, vocab)

        # Weight tying: the output projection shares the embedding matrix.
        self.head.weight = self.emb.weight

    def forward(self, x):
        # x: (B, T) token ids -> (B, T, vocab) logits.
        T = x.size(1)
        h = self.emb(x) + self.pos(torch.arange(T, device=x.device))
        for layer in self.layers:
            h = layer(h)
        return self.head(self.ln(h))
187
+
188
def format_params(num):
    """Return `num` formatted with a B/M/K suffix at one decimal place."""
    if num >= 1_000_000_000:
        scaled, unit = num / 1_000_000_000, "B"
    elif num >= 1_000_000:
        scaled, unit = num / 1_000_000, "M"
    else:
        scaled, unit = num / 1_000, "K"
    return f"{scaled:.1f}{unit}"
195
+
196
@torch.no_grad()
def estimate_loss(model, dl, device, ctx):
    """Average cross-entropy over up to 50 batches from loader `dl`.

    Temporarily switches the model to eval mode and restores train mode
    before returning. Returns 0.0 when the loader yields no batches.
    """
    model.eval()
    losses = []
    limit = 50  # cap evaluation cost
    for i, (x, y) in enumerate(dl):
        if i >= limit: break
        x, y = x.to(device), y.to(device)
        with ctx:  # same autocast / nullcontext used during training
            logits = model(x)
            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), y.reshape(-1), ignore_index=PAD_ID)
        losses.append(loss.item())
    model.train()
    return sum(losses) / len(losses) if losses else 0.0
210
+
211
def train():
    """Pre-train CrimsonBase on the reduced-vocab corpus in DATA_DIR.

    Builds the vocab remap, tokenizes (a DATA_PCT slice of) the combined
    corpus, trains with gradient accumulation and optional AMP / compile,
    reports a validation loss each epoch, and checkpoints to a filename
    derived from the parameter count.
    """
    # Device autodetection: CUDA > Apple MPS > CPU.
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print("[INFO] Device:", device)

    tok = tiktoken.get_encoding(TOKENIZER_NAME)

    used, id2new = build_dataset_vocab(DATA_DIR, tok, VOCAB_SAVE_PATH)
    vocab = len(used) + OFFSET

    print("[INFO] Loading and tokenizing text from all files...")
    all_files = [os.path.join(DATA_DIR, f) for f in os.listdir(DATA_DIR) if f.endswith(".txt")]
    full_text = ""
    # NOTE(review): handles opened here are never explicitly closed.
    for fpath in all_files:
        full_text += open(fpath, "r", encoding="utf-8").read() + "\n"

    # Optionally train on only a leading slice of the corpus.
    if DATA_PCT < 1.0:
        full_text = full_text[:int(len(full_text) * DATA_PCT)]

    raw_ids = tok.encode(full_text)
    # Remap to the compact vocab; ids missing from the map fall back to PAD_ID.
    ids = [id2new.get(i, PAD_ID) for i in raw_ids] + [EOS_ID]

    # 90/10 contiguous train/validation split.
    n = len(ids)
    split_idx = int(n * 0.9)
    train_ids = ids[:split_idx]
    val_ids = ids[split_idx:]

    print(f"[INFO] Tokens: {n} | Train: {len(train_ids)} | Val: {len(val_ids)}")

    train_ds = RemappedTextDataset(train_ids, MAX_SEQ_LEN)
    val_ds = RemappedTextDataset(val_ids, MAX_SEQ_LEN)

    train_dl = DataLoader(train_ds, batch_size=MICRO_BATCH_SIZE, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=MICRO_BATCH_SIZE, shuffle=False)

    model = CrimsonBase(vocab).to(device)

    # Checkpoint filename encodes the parameter count (e.g. crimson_base_8.9M.pt).
    num_params = sum(p.numel() for p in model.parameters())
    param_str = format_params(num_params)
    save_path = f"crimson_base_{param_str}.pt"
    print(f"[INFO] Model parameters: {num_params:,} ({param_str})")
    print(f"[INFO] Save path: {save_path}")

    if os.path.exists(save_path):
        model.load_state_dict(torch.load(save_path, map_location=device))
        print(f"[RESUME] Loaded existing checkpoint from {save_path}")

    if device == "cuda" and COMPILE:
        print("[INFO] Compiling model with torch.compile...")
        model = torch.compile(
            model,
            mode=COMPILE_MODE,
            fullgraph=False,
            backend=COMPILE_BACKEND
        )

    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)  # padding is not scored

    # AMP only on CUDA; otherwise use a no-op context and no gradient scaler.
    if device == "cuda" and USE_AMP:
        ctx = torch.amp.autocast(device)
        scaler = torch.amp.GradScaler(device)
    else:
        import contextlib
        ctx = contextlib.nullcontext()
        scaler = None

    for ep in range(EPOCHS):
        print(f"\nEpoch {ep+1}/{EPOCHS}")
        opt.zero_grad(set_to_none=True)

        pbar = tqdm(train_dl, desc="Training")
        running_loss = 0.0

        for i, (x, y) in enumerate(pbar):
            x, y = x.to(device), y.to(device)

            with ctx:
                logits = model(x)
                loss = loss_fn(logits.reshape(-1, vocab), y.reshape(-1))
                loss_val = loss.item()          # unscaled value, for logging
                loss = loss / GRAD_ACCUM_STEPS  # scale for gradient accumulation

            if scaler:
                scaler.scale(loss).backward()
            else:
                loss.backward()

            # Step the optimizer only every GRAD_ACCUM_STEPS micro-batches.
            if (i+1) % GRAD_ACCUM_STEPS == 0:
                if scaler:
                    scaler.step(opt)
                    scaler.update()
                else:
                    opt.step()
                opt.zero_grad(set_to_none=True)

            # Exponential moving average of the loss for the progress bar.
            running_loss = 0.9 * running_loss + 0.1 * loss_val if running_loss > 0 else loss_val
            pbar.set_postfix(loss=f"{running_loss:.4f}")

        val_loss = estimate_loss(model, val_dl, device, ctx)
        print(f"Epoch {ep+1} finished. Train Loss: {running_loss:.4f} | Val Loss: {val_loss:.4f}")

        if SAVE_N_EPOCHS and (ep+1) % SAVE_N_EPOCHS == 0:
            torch.save(model.state_dict(), save_path)
            print(f"[OK] Saved checkpoint to {save_path}")

    torch.save(model.state_dict(), save_path)
    print("[DONE] Training complete.")

if __name__ == "__main__":
    train()
vocab_map.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3403a73a81a7186b53b28a093fbff744e12a9fcaffe663fd7294b8a0f875a7e
3
+ size 214393