memory-augmented-generation

Sleeping

App Files Community

Pavantej committed on Dec 21, 2025

Commit

b6df69a

verified ·

1 Parent(s): dd93f43

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +48 -44

app.py CHANGED Viewed

@@ -24,9 +24,10 @@ print()
 MODEL_NAME = "distilgpt2"
 HIDDEN_DIM = 768  # distilgpt2 hidden dimension
 MEMORY_DIM = 256  # Memory space dimension
-LEARNING_RATE = 1e-3  # Base learning rate for test-time updates
 MAX_NEW_TOKENS = 50  # Max tokens to generate
-MEMORY_ALPHA = 0.1  # Memory influence strength on generation
 # ========== Initialize Components ==========
 print("🧠 Initializing Titans + MIRAS brain...")
@@ -88,53 +89,56 @@ def chat(message, history):
     if seq_len > 1:
         # We have context - train on predicting each next token
-        with torch.enable_grad():
-            total_lm_loss = 0.0
-            # For each position (except last), predict next token
-            for pos in range(seq_len - 1):
-                h_pos = all_hidden[:, pos, :]  # Hidden at position pos
-                # Project to memory space
-                k = key_proj(h_pos)
-                # Query memory and augment hidden state
-                memory_out = memory(k)
-                h_augmented = h_pos + MEMORY_ALPHA * output_proj(memory_out)
-                # Compute logits for next token
-                logits = model.lm_head(h_augmented)  # (1, vocab_size)
-                # Target is the actual next token
-                target = inputs['input_ids'][:, pos + 1]
-                # Cross-entropy loss
-                lm_loss = nn.functional.cross_entropy(logits, target)
-                total_lm_loss = total_lm_loss + lm_loss
-            # Average loss over positions
-            memory_loss = total_lm_loss / (seq_len - 1)
-            # Get retention factor
-            retention = memory.retention_gate(memory_loss)
-            effective_lr = LEARNING_RATE * retention
-            # Backprop and update
-            memory_loss.backward()
-            with torch.no_grad():
-                # Update memory
-                if memory.W.grad is not None:
-                    memory.W -= effective_lr * memory.W.grad
-                    memory.W.grad.zero_()
-                # Update output projection
-                if output_proj.projection.weight.grad is not None:
-                    output_proj.projection.weight -= effective_lr * output_proj.projection.weight.grad
-                    output_proj.projection.weight.grad.zero_()
-                # Update stats
-                memory.update_stats(memory_loss)
     else:
         # Single token - just compute MSE for stats
         with torch.no_grad():

 MODEL_NAME = "distilgpt2"
 HIDDEN_DIM = 768  # distilgpt2 hidden dimension
 MEMORY_DIM = 256  # Memory space dimension
+LEARNING_RATE = 0.01  # Increased learning rate for faster adaptation
 MAX_NEW_TOKENS = 50  # Max tokens to generate
+MEMORY_ALPHA = 1.0  # Increased from 0.1 - stronger memory influence
+NUM_TRAIN_STEPS = 5  # Multiple gradient steps per input for better learning
 # ========== Initialize Components ==========
 print("🧠 Initializing Titans + MIRAS brain...")
     if seq_len > 1:
         # We have context - train on predicting each next token
+        # Run multiple training steps for faster learning
+        for train_step in range(NUM_TRAIN_STEPS):
+            with torch.enable_grad():
+                total_lm_loss = 0.0
+                # For each position (except last), predict next token
+                for pos in range(seq_len - 1):
+                    h_pos = all_hidden[:, pos, :]  # Hidden at position pos
+                    # Project to memory space
+                    k = key_proj(h_pos)
+                    # Query memory and augment hidden state
+                    memory_out = memory(k)
+                    h_augmented = h_pos + MEMORY_ALPHA * output_proj(memory_out)
+                    # Compute logits for next token
+                    logits = model.lm_head(h_augmented)  # (1, vocab_size)
+                    # Target is the actual next token
+                    target = inputs['input_ids'][:, pos + 1]
+                    # Cross-entropy loss
+                    lm_loss = nn.functional.cross_entropy(logits, target)
+                    total_lm_loss = total_lm_loss + lm_loss
+                # Average loss over positions
+                memory_loss = total_lm_loss / (seq_len - 1)
+                # Get retention factor
+                retention = memory.retention_gate(memory_loss)
+                effective_lr = LEARNING_RATE * retention
+                # Backprop and update
+                memory_loss.backward()
+                with torch.no_grad():
+                    # Update memory
+                    if memory.W.grad is not None:
+                        memory.W -= effective_lr * memory.W.grad
+                        memory.W.grad.zero_()
+                    # Update output projection
+                    if output_proj.projection.weight.grad is not None:
+                        output_proj.projection.weight -= effective_lr * output_proj.projection.weight.grad
+                        output_proj.projection.weight.grad.zero_()
+        # Update stats after all training steps (use final loss)
+        with torch.no_grad():
+            memory.update_stats(memory_loss)
     else:
         # Single token - just compute MSE for stats
         with torch.no_grad():