"""
Docker Neural Memory - Production Demo
REAL neural memory implementation using Titans architecture.
Demonstrates Docker-native AI memory with MCP server integration.
Deploy to: https://huggingface.co/spaces
"""
import os
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Tuple
import gradio as gr
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import torch
from huggingface_hub import InferenceClient
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
matplotlib.use("Agg")
# =============================================================================
# CUSTOM CSS FOR POLISHED UI
# =============================================================================
# Stylesheet string for the Gradio UI. It imports the Outfit and JetBrains Mono
# fonts from Google Fonts, declares the colour palette as CSS variables on
# :root, and then restyles the container, headings, code, tabs, buttons,
# labels, inputs, boxes, plots and markdown. Most rules carry !important.
# NOTE(review): some selectors use Svelte-generated class names
# (e.g. `svelte-1gfkn6j`), which presumably differ between Gradio releases -
# confirm against the Gradio version this app is deployed with.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600&family=Outfit:wght@300;400;500;600;700&display=swap');
:root {
--neural-cyan: #00d4ff;
--neural-cyan-glow: rgba(0, 212, 255, 0.3);
--rag-orange: #ff8c42;
--purple-accent: #a855f7;
--bg-deep: #0a0a1a;
--bg-card: #12122a;
--bg-card-hover: #1a1a3a;
--text-primary: #f8fafc;
--text-secondary: #94a3b8;
--border-subtle: rgba(148, 163, 184, 0.1);
--success-green: #22c55e;
}
/* Global font settings */
.gradio-container {
font-family: 'Outfit', system-ui, -apple-system, sans-serif !important;
background: linear-gradient(180deg, var(--bg-deep) 0%, #0f0f23 100%) !important;
}
/* Headings */
.gradio-container h1, .gradio-container h2, .gradio-container h3, .gradio-container h4 {
font-family: 'Outfit', sans-serif !important;
font-weight: 600 !important;
letter-spacing: -0.02em !important;
}
/* Code and monospace */
.gradio-container code, .gradio-container pre {
font-family: 'JetBrains Mono', monospace !important;
}
/* Tab styling */
.tabs > .tab-nav > button {
font-family: 'Outfit', sans-serif !important;
font-weight: 500 !important;
padding: 12px 24px !important;
border-radius: 8px 8px 0 0 !important;
transition: all 0.3s ease !important;
}
.tabs > .tab-nav > button.selected {
background: linear-gradient(135deg, var(--neural-cyan) 0%, var(--purple-accent) 100%) !important;
color: white !important;
}
/* Button styling */
.gr-button {
font-family: 'Outfit', sans-serif !important;
font-weight: 500 !important;
border-radius: 8px !important;
transition: all 0.3s ease !important;
}
.gr-button-primary {
background: linear-gradient(135deg, var(--neural-cyan) 0%, var(--purple-accent) 100%) !important;
border: none !important;
}
.gr-button-primary:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 25px var(--neural-cyan-glow) !important;
}
.gr-button-secondary {
background: transparent !important;
border: 1px solid var(--text-secondary) !important;
color: var(--text-secondary) !important;
}
.gr-button-secondary:hover {
border-color: var(--neural-cyan) !important;
color: var(--neural-cyan) !important;
}
/* FIX: Labels should NOT look like buttons */
.gr-textbox label, .gr-plot label, .gr-dropdown label, .gr-checkbox label,
label.svelte-1gfkn6j, .label-wrap, span.svelte-1gfkn6j {
background: transparent !important;
border: none !important;
padding: 0 !important;
box-shadow: none !important;
font-weight: 500 !important;
color: var(--text-secondary) !important;
cursor: default !important;
}
/* Ensure label containers don't have button styling */
.gr-form > label, .gr-box > label, div[data-testid="textbox"] > label {
background: none !important;
border: none !important;
box-shadow: none !important;
}
/* Input styling */
.gr-textbox textarea, .gr-textbox input {
font-family: 'Outfit', sans-serif !important;
background: var(--bg-card) !important;
border: 1px solid var(--border-subtle) !important;
border-radius: 8px !important;
transition: all 0.3s ease !important;
}
.gr-textbox textarea:focus, .gr-textbox input:focus {
border-color: var(--neural-cyan) !important;
box-shadow: 0 0 0 3px var(--neural-cyan-glow) !important;
}
/* Card/box styling */
.gr-box, .gr-panel {
background: var(--bg-card) !important;
border: 1px solid var(--border-subtle) !important;
border-radius: 12px !important;
}
/* Plot styling */
.gr-plot {
background: var(--bg-card) !important;
border-radius: 12px !important;
border: 1px solid var(--border-subtle) !important;
}
/* Markdown styling */
.prose {
color: var(--text-primary) !important;
}
.prose h3, .prose h4 {
color: var(--neural-cyan) !important;
}
/* Smooth animations */
* {
transition: background-color 0.2s ease, border-color 0.2s ease, box-shadow 0.2s ease;
}
"""
HEADER_HTML = '''
π§
Docker Neural Memory
Test-Time Training: Evolving LLMs from data hoarders to knowledge creators
β‘ PyTorch TTT
π³ Docker Native
π MCP Server
π Titans Architecture
'''
FOOTER_HTML = '''
Built by
Carlos Crespo Macaya
AI Engineer β GenAI Systems & Applied MLOps
Docker Neural Memory β Containerized AI memory with real test-time training
'''
# =============================================================================
# HUGGINGFACE INFERENCE CLIENT
# =============================================================================
# Use a model that is available on HF Serverless Inference free tier
# See: https://huggingface.co/models?inference_provider=hf-inference&pipeline_tag=text-generation
# `or` is used instead of a getenv default so that a variable which is set but
# empty falls back too: an empty HF_MODEL would otherwise select no model, and
# an empty HF_TOKEN would be handed to the client as a (blank) token.
HF_MODEL = os.getenv("HF_MODEL") or "HuggingFaceTB/SmolLM3-3B"
HF_TOKEN = os.getenv("HF_TOKEN") or None  # Optional - works without for many models

# The client is created once at import time. Any failure is reported and
# recorded in LLM_AVAILABLE so the rest of the app can degrade gracefully
# instead of crashing on start-up.
try:
    hf_client = InferenceClient(model=HF_MODEL, token=HF_TOKEN)
    LLM_AVAILABLE = True
except Exception as e:
    print(f"Warning: Could not initialize HF client: {e}")
    hf_client = None
    LLM_AVAILABLE = False
# Add src to path for real implementation
# When deployed to HF Spaces, src/ is copied to the same directory as app.py
sys.path.insert(0, str(Path(__file__).parent))
from src.config import MemoryConfig # noqa: E402
from src.memory.neural_memory import NeuralMemory # noqa: E402
# =============================================================================
# REAL NEURAL MEMORY INSTANCE
# =============================================================================
# Initialize the REAL neural memory - this is actual PyTorch, not a simulation.
# Module-level singleton used by every handler in this file; reset_memory()
# rebinds it to a freshly constructed instance with the same configuration.
memory = NeuralMemory(MemoryConfig(dim=256, learning_rate=0.02))
# Track history for visualization: observe_content() appends one dict per
# observation (keys: "content", "surprise", "weight_delta", "learned") and
# create_history_plot() reads the "surprise" values.
observation_history: List[Dict] = []
# =============================================================================
# COMPARISON METRICS & KNOWLEDGE BASE
# =============================================================================
@dataclass
class ComparisonMetrics:
    """Running counters for the Neural Memory vs. RAG comparison.

    One instance lives at module level; ``compare_responses`` updates it and
    ``get_comparison_summary`` renders it. The ``vanilla_*`` fields hold the
    figures for the RAG path.
    """

    # --- Neural-memory-augmented path ---
    nm_queries: int = 0          # questions sent through the neural-memory prompt
    nm_correct: int = 0          # answers judged confident while facts were stored
    nm_hallucinations: int = 0   # not updated anywhere in the visible code
    nm_response_times: list[float] = field(default_factory=list)  # seconds per call

    # --- Baseline (RAG) path ---
    vanilla_queries: int = 0         # questions sent through the RAG prompt
    vanilla_correct: int = 0         # not updated anywhere in the visible code
    vanilla_hallucinations: int = 0  # reported as "Retrieval Failures" in the summary
    vanilla_response_times: list[float] = field(default_factory=list)  # seconds per call
# Module-level comparison counters; replaced with a fresh instance by
# reset_comparison().
metrics = ComparisonMetrics()
# Knowledge base - facts the user teaches. add_to_knowledge_base() appends
# dicts of the form {"fact": <text>, "timestamp": <time.time() float>}.
# NOTE(review): the timestamp value is a float, so Dict[str, str] is narrower
# than what is actually stored.
knowledge_base: List[Dict[str, str]] = []
# Store embeddings for t-SNE visualization. Each entry carries "label",
# "embedding", "surprise" and "timestamp" (see add_to_knowledge_base()).
embeddings_store: List[Dict] = []
def get_embedding(text: str) -> np.ndarray:
    """Return a fixed-length (256) vector describing *text*.

    The text is encoded with ``memory._encode_text`` and pushed through
    ``memory.memory_net`` with gradients disabled. The flattened output is
    zero-padded on the right, or cut off, so that it always has 256 entries.
    """
    width = 256
    with torch.no_grad():
        encoded = memory._encode_text(text)
        activations = memory.memory_net(encoded)
        values = activations.cpu().numpy().flatten()

    # Copy as many values as fit into a zero-filled buffer: this pads short
    # outputs and truncates long ones in a single step.
    fixed = np.zeros(width, dtype=values.dtype)
    keep = min(values.size, width)
    fixed[:keep] = values[:keep]
    return fixed
def create_knowledge_base_visualization() -> plt.Figure:
    """Draw the most recent knowledge-base facts as a stack of cards.

    Up to ten facts (the newest ones) are shown, each numbered by its position
    in ``knowledge_base`` and shortened to 60 characters. When the knowledge
    base is empty a placeholder message is drawn instead.
    """
    max_cards = 10
    fig, ax = plt.subplots(figsize=(8, 6))

    # Both outcomes draw on a bare unit square without axis decorations.
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis("off")

    total = len(knowledge_base)
    if total == 0:
        ax.text(
            0.5, 0.5,
            "No facts in knowledge base yet.\nAdd facts to see them here.",
            ha="center", va="center", fontsize=14, color="gray"
        )
        ax.set_title("Knowledge Base (RAG Store)", fontsize=14, fontweight="bold")
        return fig

    ax.set_title(f"Knowledge Base (RAG Store) - {total} Facts", fontsize=14, fontweight="bold")

    newest = knowledge_base[-max_cards:]
    first_number = total - len(newest) + 1  # 1-based index of the first card shown
    card_centres = np.linspace(0.9, 0.1, min(total, max_cards))

    for number, (centre, entry) in enumerate(zip(card_centres, newest), start=first_number):
        caption = entry["fact"]
        if len(caption) > 60:
            caption = caption[:57] + "..."

        # Card background: a plain rectangle centred on the row.
        card = plt.Rectangle(
            (0.02, centre - 0.035), 0.96, 0.07,
            facecolor="#e8f4f8", edgecolor="#3498db",
            linewidth=2, alpha=0.8, zorder=1,
        )
        ax.add_patch(card)

        ax.text(0.05, centre, f"#{number}",
                fontsize=10, fontweight="bold", color="#2980b9", va="center")
        ax.text(0.12, centre, caption, fontsize=10, va="center", color="#2c3e50")

    hidden = total - max_cards
    if hidden > 0:
        ax.text(0.5, 0.02, f"... and {hidden} more facts",
                ha="center", fontsize=9, color="gray", style="italic")

    plt.tight_layout()
    return fig
def create_neural_memory_state_visualization() -> plt.Figure:
    """Build a three-panel figure describing the current neural memory.

    Panels, left to right:
      1. histogram of every parameter value in ``memory.memory_net``;
      2. heat map of the 16x16 weight sample from ``get_weight_sample()``;
      3. a text box with the values from ``memory.get_stats()`` plus the
         mean / std / min / max of all weights.

    Returns:
        The matplotlib figure (not shown, not closed).
    """
    fig, (hist_ax, heat_ax, stats_ax) = plt.subplots(1, 3, figsize=(14, 4))

    # 1. Weight distribution histogram.
    # Concatenate the per-tensor arrays directly instead of extending a Python
    # list element by element, which boxes every weight as a separate object.
    with torch.no_grad():
        all_weights = np.concatenate(
            [param.data.cpu().numpy().ravel() for param in memory.memory_net.parameters()]
        )
    hist_ax.hist(all_weights, bins=50, color="#3498db", alpha=0.7, edgecolor="white")
    hist_ax.axvline(x=0, color="red", linestyle="--", alpha=0.5)
    hist_ax.set_title("Weight Distribution", fontsize=11, fontweight="bold")
    hist_ax.set_xlabel("Weight Value")
    hist_ax.set_ylabel("Count")
    hist_ax.grid(True, alpha=0.3)

    # 2. Weight heatmap (sample)
    weights = get_weight_sample()
    im = heat_ax.imshow(weights, cmap="RdBu_r", aspect="auto", vmin=-0.5, vmax=0.5)
    heat_ax.set_title("Weight Matrix Sample (16x16)", fontsize=11, fontweight="bold")
    heat_ax.axis("off")
    plt.colorbar(im, ax=heat_ax, label="Value")

    # 3. Memory stats
    stats_ax.axis("off")
    stats = memory.get_stats()
    # The rule and bullet characters below replace mis-encoded byte sequences
    # that previously appeared in this text.
    stats_text = f"""
Neural Memory State
───────────────────
Parameters: {stats['weight_parameters']:,}
Dimension: {stats['dimension']}
Learning Rate: {stats['learning_rate']:.4f}
Observations: {stats['total_observations']}
Avg Surprise: {stats['avg_surprise']:.4f}
Weight Stats:
• Mean: {np.mean(all_weights):.4f}
• Std: {np.std(all_weights):.4f}
• Min: {np.min(all_weights):.4f}
• Max: {np.max(all_weights):.4f}
"""
    stats_ax.text(0.1, 0.5, stats_text, fontsize=10, family="monospace",
                  va="center", transform=stats_ax.transAxes,
                  bbox={"boxstyle": "round,pad=0.5", "facecolor": "#f0f0f0", "alpha": 0.8})
    stats_ax.set_title("Memory Statistics", fontsize=11, fontweight="bold")

    plt.tight_layout()
    return fig
def create_tsne_visualization() -> plt.Figure:
    """Scatter-plot the stored embeddings after reducing them to two dimensions.

    With fewer than two stored embeddings a placeholder message is drawn.
    PCA is used for fewer than five samples and t-SNE otherwise. Points are
    coloured by the surprise recorded for each entry and annotated with the
    entry's label (cut to 30 characters).
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    count = len(embeddings_store)

    if count < 2:
        ax.text(
            0.5, 0.5,
            "Add at least 2 facts to see the embedding space",
            ha="center", va="center", fontsize=14, color="gray"
        )
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.axis("off")
        return fig

    # Pull the three parallel columns out of the store.
    vectors = np.array([record["embedding"] for record in embeddings_store])
    novelty = [record["surprise"] for record in embeddings_store]
    captions = []
    for record in embeddings_store:
        name = record["label"]
        captions.append(name[:30] + "..." if len(name) > 30 else name)

    # Choose the projection by sample count; perplexity is kept below it.
    if count >= 5:
        method = "t-SNE"
        projector = TSNE(n_components=2, perplexity=min(30, count - 1), random_state=42)
    else:
        method = "PCA"
        projector = PCA(n_components=2)
    coords = projector.fit_transform(vectors)

    points = ax.scatter(
        coords[:, 0], coords[:, 1],
        c=novelty, cmap="RdYlBu_r",
        s=150, alpha=0.7, edgecolors="white", linewidth=2
    )

    for (x_pos, y_pos), caption in zip(coords, captions):
        ax.annotate(
            caption, (x_pos, y_pos),
            xytext=(5, 5), textcoords="offset points",
            fontsize=9, alpha=0.8,
            bbox={"boxstyle": "round,pad=0.3", "facecolor": "white", "alpha": 0.7}
        )

    colour_bar = plt.colorbar(points, ax=ax)
    colour_bar.set_label("Surprise (Red=Novel, Blue=Familiar)", fontsize=10)

    ax.set_title(
        f"Neural Memory Embedding Space ({method})\n"
        f"{count} observations - Similar concepts cluster together",
        fontsize=12, fontweight="bold",
    )
    ax.set_xlabel("Dimension 1")
    ax.set_ylabel("Dimension 2")
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    return fig
def create_embedding_comparison() -> plt.Figure:
    """Return a two-panel figure: weight heat map (left), embeddings (right).

    The left panel shows the sample from ``get_weight_sample()``. The right
    panel shows the stored embeddings reduced to 2-D (PCA below five samples,
    t-SNE otherwise), or a hint when fewer than two embeddings exist.
    """
    fig, (weights_ax, embed_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: weight heatmap
    sample = get_weight_sample()
    heat = weights_ax.imshow(sample, cmap="RdBu_r", aspect="auto", vmin=-0.5, vmax=0.5)
    weights_ax.set_title("Neural Network Weights\n(These update during learning)",
                         fontsize=11, fontweight="bold")
    weights_ax.axis("off")
    plt.colorbar(heat, ax=weights_ax, label="Weight Value")

    # Right panel: embedding space
    count = len(embeddings_store)
    if count >= 2:
        vectors = np.array([record["embedding"] for record in embeddings_store])
        novelty = [record["surprise"] for record in embeddings_store]
        if count >= 5:
            projector = TSNE(n_components=2, perplexity=min(30, count - 1), random_state=42)
        else:
            projector = PCA(n_components=2)
        coords = projector.fit_transform(vectors)
        points = embed_ax.scatter(coords[:, 0], coords[:, 1], c=novelty,
                                  cmap="RdYlBu_r", s=100, alpha=0.7)
        plt.colorbar(points, ax=embed_ax, label="Surprise")
        embed_ax.grid(True, alpha=0.3)
    else:
        embed_ax.text(0.5, 0.5, "Add facts to see\nembedding space",
                      ha="center", va="center", fontsize=12, color="gray")
        embed_ax.set_xlim(0, 1)
        embed_ax.set_ylim(0, 1)

    embed_ax.set_title("Learned Representations\n(Similar facts cluster together)",
                       fontsize=11, fontweight="bold")
    plt.tight_layout()
    return fig
def call_llm(prompt: str, context: str = "") -> Tuple[str, float]:
    """Send *prompt* to the Hugging Face chat model.

    Args:
        prompt: The user message.
        context: Optional knowledge text. When non-empty it is wrapped in a
            system message instructing the model to answer from it only.

    Returns:
        ``(answer, seconds)``. If the client is unavailable, or the call
        raises, the answer is a placeholder / ``"Error: ..."`` string and the
        time is ``0.0``.
    """
    if hf_client is None or not LLM_AVAILABLE:
        return "[LLM not available - set HF_TOKEN for comparison demo]", 0.0

    try:
        # The user turn is always present; the system turn only with context.
        conversation = [{"role": "user", "content": prompt}]
        if context:
            grounding = f"""You have access to the following knowledge:
{context}
Based ONLY on the knowledge above, answer questions. If the information is not in the knowledge provided, say "I don't have information about that."
"""
            conversation.insert(0, {"role": "system", "content": grounding})

        started = time.time()
        reply = hf_client.chat_completion(
            messages=conversation,
            max_tokens=150,
            temperature=0.7,
        )
        duration = time.time() - started

        text = reply.choices[0].message.content
        if not text:
            return "", duration
        return text.strip(), duration
    except Exception as exc:
        return f"Error: {exc!s}", 0.0
def add_to_knowledge_base(fact: str) -> Tuple[str, plt.Figure, plt.Figure, plt.Figure]:
    """Teach *fact* to the neural memory and record it in both stores.

    Args:
        fact: Text entered by the user. ``None``, empty or whitespace-only
            input is rejected with a prompt message and nothing is stored.

    Returns:
        ``(markdown_report, embedding_plot, memory_state_plot, kb_plot)``.
        The three figures always reflect the state after this call.
    """
    if not fact or not fact.strip():
        return (
            "Please enter a fact to add.",
            create_tsne_visualization(),
            create_neural_memory_state_visualization(),
            create_knowledge_base_visualization(),
        )

    # Learn and embed first. The stores are only touched once both steps have
    # succeeded, so an exception here cannot leave knowledge_base and
    # embeddings_store with different lengths.
    result = memory.observe(fact)
    embedding = get_embedding(fact)

    # One timestamp for the one event, shared by both records.
    recorded_at = time.time()
    knowledge_base.append({"fact": fact, "timestamp": recorded_at})
    embeddings_store.append({
        "label": fact,
        "embedding": embedding,
        "surprise": result["surprise"],
        "timestamp": recorded_at,
    })

    output = f"""### Fact Added
**Fact:** "{fact}"
**Neural Memory Response:**
| Metric | Value |
|--------|-------|
| Surprise | {result['surprise']:.4f} |
| Weight Delta | {result['weight_delta']:.6f} |
| Learned | {'Yes' if result['learned'] else 'No'} |
**Knowledge Base Size:** {len(knowledge_base)} facts
**Embeddings Stored:** {len(embeddings_store)}
"""
    return (
        output,
        create_tsne_visualization(),
        create_neural_memory_state_visualization(),
        create_knowledge_base_visualization(),
    )
def get_knowledge_context() -> str:
    """Return every stored fact as a ``- ``-prefixed line, joined by newlines.

    An empty knowledge base yields the empty string.
    """
    bullet_lines = [f"- {entry['fact']}" for entry in knowledge_base]
    return "\n".join(bullet_lines)
def _rag_keyword_retrieve(question: str, knowledge_base: List[Dict], top_k: int = 2) -> List[str]:
    """Return up to *top_k* facts sharing at least one word with *question*."""
    import re

    def words(text: str) -> set:
        # \w+ drops punctuation, so "themes?" and "themes." both yield
        # "themes"; a plain whitespace split would keep them distinct.
        return set(re.findall(r"\w+", text.lower()))

    wanted = words(question)
    scored = [(len(wanted & words(item["fact"])), item["fact"]) for item in knowledge_base]
    # list.sort is stable, so facts with equal overlap keep insertion order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [fact for score, fact in scored[:top_k] if score > 0]


def call_rag_llm(question: str, knowledge_base: List[Dict]) -> Tuple[str, float, List[str]]:
    """Simulate RAG: retrieve the best keyword matches and answer from them only.

    Args:
        question: The user's question.
        knowledge_base: Dicts with a ``"fact"`` key.

    Returns:
        ``(answer, seconds, retrieved)``. ``retrieved`` holds the facts placed
        in the prompt, or ``["(none retrieved)"]`` when nothing matched. When
        the LLM is unavailable the result is ``("[LLM not available]", 0.0,
        [])``; when the call raises, the answer is ``"Error: ..."`` and the
        time is ``0.0``.
    """
    if not LLM_AVAILABLE or hf_client is None:
        return "[LLM not available]", 0.0, []

    # Simple RAG simulation: keyword-based retrieval (top 2 most relevant)
    retrieved = _rag_keyword_retrieve(question, knowledge_base, top_k=2)

    if retrieved:
        context = "Retrieved facts:\n" + "\n".join([f"- {f}" for f in retrieved])
        system_msg = f"""You are a RAG system. You can ONLY use the retrieved facts below to answer.
If the retrieved facts don't directly answer the question, say "The retrieved information doesn't cover this."
{context}
"""
    else:
        system_msg = "You are a RAG system with no relevant documents retrieved. Say 'No relevant documents found.'"
        retrieved = ["(none retrieved)"]

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": question},
    ]
    try:
        # perf_counter is monotonic, so the interval cannot be distorted by
        # wall-clock adjustments.
        start = time.perf_counter()
        response = hf_client.chat_completion(messages=messages, max_tokens=150, temperature=0.7)
        elapsed = time.perf_counter() - start
        answer = response.choices[0].message.content
        return answer.strip() if answer else "", elapsed, retrieved
    except Exception as e:
        return f"Error: {e!s}", 0.0, retrieved
def call_neural_memory_llm(question: str, knowledge_base: List[Dict], surprise: float) -> Tuple[str, float]:
    """Answer *question* with every stored fact plus pattern hints in the prompt.

    Args:
        question: The user's question.
        knowledge_base: Dicts with a ``"fact"`` key; all of them are included.
        surprise: Novelty score for the question, quoted in the prompt.

    Returns:
        ``(answer, seconds)``. When the LLM is unavailable the result is
        ``("[LLM not available]", 0.0)``; when the call raises, the answer is
        ``"Error: ..."`` and the time is ``0.0``.
    """
    import re

    if not LLM_AVAILABLE or hf_client is None:
        return "[LLM not available]", 0.0

    # Neural memory provides ALL context + pattern awareness
    all_facts = "\n".join([f"- {item['fact']}" for item in knowledge_base])

    # Whole-word matching: a substring test would count "disliked" as "liked"
    # and "disapproved" as "approved", inflating the positive count.
    positive = re.compile(r"\b(?:approved|liked)\b", re.IGNORECASE)
    negative = re.compile(r"\b(?:rejected|disliked)\b", re.IGNORECASE)

    patterns_hint = ""
    if knowledge_base:
        approvals = [item["fact"] for item in knowledge_base if positive.search(item["fact"])]
        rejections = [item["fact"] for item in knowledge_base if negative.search(item["fact"])]
        if approvals or rejections:
            patterns_hint = "\n\nLearned patterns from observations:"
            if approvals:
                patterns_hint += f"\n- Positive signals: {len(approvals)} approvals/likes"
            if rejections:
                patterns_hint += f"\n- Negative signals: {len(rejections)} rejections/dislikes"
            patterns_hint += "\n- Look for common themes in approved vs rejected items"

    system_msg = f"""You are an AI with neural memory that has LEARNED from all observations below.
Unlike simple retrieval, you should:
1. Consider ALL facts holistically
2. Identify PATTERNS across multiple observations
3. Make INFERENCES based on learned patterns
4. Predict based on trends, not just direct matches
Observations (learned knowledge):
{all_facts}
{patterns_hint}
Question novelty (surprise score): {surprise:.2f}
- Low surprise (<0.3): This topic is familiar from your observations
- High surprise (>0.6): This is a novel topic, be cautious
"""
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": question},
    ]
    try:
        start = time.time()
        response = hf_client.chat_completion(messages=messages, max_tokens=200, temperature=0.7)
        elapsed = time.time() - start
        answer = response.choices[0].message.content
        return answer.strip() if answer else "", elapsed
    except Exception as e:
        return f"Error: {e!s}", 0.0
def compare_responses(question: str) -> Tuple[str, str, str, plt.Figure, plt.Figure]:
    """Answer *question* with both pipelines and report them side by side.

    The question is scored for surprise, sent through
    ``call_neural_memory_llm`` and ``call_rag_llm``, and the module-level
    ``metrics`` counters are updated.

    Args:
        question: The user's question. ``None``, empty or whitespace-only
            input returns empty texts and the current figures.

    Returns:
        ``(neural_memory_markdown, rag_markdown, summary_markdown,
        memory_state_figure, knowledge_base_figure)``.
    """

    def as_blockquote(text: str) -> str:
        # Prefix every line so a multi-line answer stays inside the quote.
        return "\n> ".join(text.splitlines()) if text else text

    if not question or not question.strip():
        return "", "", "", create_neural_memory_state_visualization(), create_knowledge_base_visualization()
    if not LLM_AVAILABLE:
        return (
            "LLM not available. Please set HF_TOKEN environment variable.",
            "LLM not available.",
            "Comparison requires LLM access.",
            create_neural_memory_state_visualization(),
            create_knowledge_base_visualization(),
        )

    # Check surprise (is this question familiar?)
    surprise = memory.surprise(question)

    # Query with NEURAL MEMORY (pattern learning, all context).
    # The LLM helpers report failures as ("Error: ...", 0.0); such calls still
    # count as queries but must not add a 0.0 latency sample or be scored as
    # a confident answer.
    nm_response, nm_time = call_neural_memory_llm(question, knowledge_base, surprise)
    nm_errored = nm_response.startswith("Error:")
    metrics.nm_queries += 1
    if not nm_errored:
        metrics.nm_response_times.append(nm_time)

    # Query with RAG (simple retrieval)
    rag_response, rag_time, retrieved_facts = call_rag_llm(question, knowledge_base)
    metrics.vanilla_queries += 1
    if not rag_response.startswith("Error:"):
        metrics.vanilla_response_times.append(rag_time)

    # Simple quality detection
    rag_lower = rag_response.lower()
    rag_failed = any(
        phrase in rag_lower
        for phrase in ("doesn't cover", "no relevant", "don't have", "cannot answer")
    )
    nm_lower = nm_response.lower()
    nm_confident = not nm_errored and not any(
        phrase in nm_lower
        for phrase in ("i don't know", "i don't have", "cannot")
    )
    if rag_failed:
        metrics.vanilla_hallucinations += 1
    if nm_confident and knowledge_base:
        metrics.nm_correct += 1

    # Same bands as the LLM prompt and check_surprise(): <0.3 familiar,
    # >0.6 novel, otherwise in between.
    if surprise < 0.3:
        surprise_label = "familiar topic"
    elif surprise > 0.6:
        surprise_label = "novel topic"
    else:
        surprise_label = "moderately novel topic"

    # Format outputs - Neural Memory
    nm_output = f"""### Neural Memory (Pattern Learning)
**Question:** {question}
**Response:**
> {as_blockquote(nm_response)}
---
**How it works:**
- Uses **ALL {len(knowledge_base)} facts** holistically
- **Learns patterns** (e.g., approval vs rejection trends)
- **Surprise Score: {surprise:.3f}** - {surprise_label}
- Response Time: {nm_time:.2f}s
"""

    # Format outputs - RAG
    retrieved_str = "\n".join([f" - {f}" for f in retrieved_facts])
    retrieved_count = len([f for f in retrieved_facts if f != '(none retrieved)'])
    rag_output = f"""### RAG (Retrieval Only)
**Question:** {question}
**Response:**
> {as_blockquote(rag_response)}
---
**How it works:**
- Retrieved **{retrieved_count} facts** by keyword match:
{retrieved_str}
- **No pattern learning** - just similarity search
- Response Time: {rag_time:.2f}s
"""

    # Comparison summary
    comparison = get_comparison_summary()
    return (
        nm_output,
        rag_output,
        comparison,
        create_neural_memory_state_visualization(),
        create_knowledge_base_visualization(),
    )
def get_comparison_summary() -> str:
    """Render the running Neural Memory vs. RAG scoreboard as Markdown.

    Averages and percentages are derived from the module-level ``metrics``
    object; each falls back to 0 when there is nothing to divide by. The
    capability table and closing text are fixed.
    """

    def mean_or_zero(samples):
        return sum(samples) / len(samples) if samples else 0

    def percent_or_zero(part, whole):
        return part / whole * 100 if whole else 0

    nm_avg_time = mean_or_zero(metrics.nm_response_times)
    rag_avg_time = mean_or_zero(metrics.vanilla_response_times)
    nm_accuracy = percent_or_zero(metrics.nm_correct, metrics.nm_queries)
    rag_fail_rate = percent_or_zero(metrics.vanilla_hallucinations, metrics.vanilla_queries)

    return f"""## Neural Memory vs RAG Comparison
| Metric | Neural Memory | RAG |
|--------|---------------|-----|
| **Queries** | {metrics.nm_queries} | {metrics.vanilla_queries} |
| **Pattern-Based Answers** | {metrics.nm_correct} ({nm_accuracy:.0f}%) | N/A |
| **Retrieval Failures** | N/A | {metrics.vanilla_hallucinations} ({rag_fail_rate:.0f}%) |
| **Avg Response Time** | {nm_avg_time:.2f}s | {rag_avg_time:.2f}s |
### Knowledge Base: {len(knowledge_base)} facts stored
### Why Neural Memory Wins
| Capability | Neural Memory | RAG |
|------------|---------------|-----|
| **Pattern Learning** | Learns trends across all data | No learning |
| **Inference** | Can predict from patterns | Only retrieves matches |
| **Context Usage** | Uses ALL facts holistically | Uses top-k retrieved |
| **Novelty Detection** | Surprise score | None |
| **Memory Size** | Fixed (neural weights) | Grows with data |
### Key Insight
Neural memory **learns patterns** (e.g., "Carlos rejects bright colors, approves dark themes")
and can **infer preferences** for novel items. RAG just retrieves similar documents.
"""
def reset_comparison() -> Tuple[str, plt.Figure, plt.Figure, plt.Figure]:
    """Start the comparison over with empty metrics, facts and embeddings.

    The three module-level objects are rebound to fresh values; the neural
    memory itself is not touched here. Returns a status message followed by
    the embedding, memory-state and knowledge-base figures.
    """
    global metrics, knowledge_base, embeddings_store
    metrics, knowledge_base, embeddings_store = ComparisonMetrics(), [], []

    status = "Comparison reset. Knowledge base and embeddings cleared."
    figures = (
        create_tsne_visualization(),
        create_neural_memory_state_visualization(),
        create_knowledge_base_visualization(),
    )
    return (status, *figures)
def reset_memory():
    """Replace the neural memory with a new instance and clear the history.

    The new memory is built with the same settings used at start-up
    (dim=256, learning_rate=0.02). Returns a confirmation message.
    """
    global memory, observation_history
    fresh_config = MemoryConfig(dim=256, learning_rate=0.02)
    memory, observation_history = NeuralMemory(fresh_config), []
    return "Memory reset. Fresh neural network initialized."
# =============================================================================
# VISUALIZATION
# =============================================================================
def get_weight_sample() -> np.ndarray:
    """Return the top-left 16x16 corner of the first layer's weight matrix.

    The slice is taken from ``memory.memory_net[0].weight`` and converted to
    a NumPy array on the CPU.
    """
    first_layer = memory.memory_net[0]
    with torch.no_grad():
        corner = first_layer.weight.data[:16, :16]
        return corner.cpu().numpy()
def create_weight_visualization() -> plt.Figure:
    """Draw the 16x16 weight sample as a heat map.

    The title reports the total parameter count of ``memory.memory_net`` and
    the x-axis label carries the explanatory caption.

    Returns:
        The matplotlib figure.
    """
    weights = get_weight_sample()
    param_count = sum(p.numel() for p in memory.memory_net.parameters())

    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(weights, cmap="RdBu_r", aspect="auto", vmin=-0.5, vmax=0.5)
    ax.set_title(
        f"Neural Memory Weights\n({param_count:,} parameters)",
        fontsize=12,
        fontweight="bold",
    )
    ax.set_xlabel("These weights UPDATE during inference (TTT)")

    # Hide ticks and frame individually. ax.axis("off") would also suppress
    # the axis label set above, so the caption would never be drawn.
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)

    plt.colorbar(im, ax=ax, label="Weight Value")
    plt.tight_layout()
    return fig
def create_history_plot() -> plt.Figure:
    """Plot the surprise value of every recorded observation, in order.

    A dashed line marks 0.5. With no observations a placeholder text is
    drawn instead. The y-axis always spans 0 to 1.
    """
    fig, ax = plt.subplots(figsize=(8, 3))

    if not observation_history:
        ax.text(0.5, 0.5, "No observations yet", ha="center", va="center", fontsize=12, color="gray")
        ax.set_xlim(0, 1)
    else:
        values = [entry["surprise"] for entry in observation_history]
        steps = range(1, len(values) + 1)
        ax.plot(steps, values, "o-", color="#e74c3c", linewidth=2, markersize=8)
        ax.axhline(y=0.5, color="gray", linestyle="--", alpha=0.5, label="Threshold")
        ax.set_xlabel("Observation #")
        ax.set_ylabel("Surprise")
        ax.grid(True, alpha=0.3)
        ax.legend()

    # Shared by both branches.
    ax.set_ylim(0, 1)
    ax.set_title("Learning Progress (Surprise Over Time)", fontsize=12, fontweight="bold")
    plt.tight_layout()
    return fig
# =============================================================================
# CORE MEMORY OPERATIONS
# =============================================================================
def observe_content(content: str) -> tuple[str, plt.Figure, plt.Figure]:
    """
    Feed content to REAL neural memory - triggers actual gradient updates.

    The weight hash is read before and after ``memory.observe`` so the report
    can state whether the weights changed, and the observation is appended to
    ``observation_history``.

    Args:
        content: Text to observe. ``None``, empty or whitespace-only input is
            rejected with a prompt message and nothing is learned.

    Returns:
        ``(markdown_report, weight_figure, history_figure)``. The figures are
        always real figures of the current state, including for rejected
        input, so the plots are not blanked and the declared return type
        holds (matching the other handlers in this file).
    """
    if not content or not content.strip():
        return (
            "Please enter content to observe.",
            create_weight_visualization(),
            create_history_plot(),
        )

    # Get weight hash BEFORE
    hash_before = memory.get_weight_hash()
    # REAL observation with actual gradient descent
    result = memory.observe(content)
    # Get weight hash AFTER
    hash_after = memory.get_weight_hash()

    # Record history
    observation_history.append({
        "content": content[:50],
        "surprise": result["surprise"],
        "weight_delta": result["weight_delta"],
        "learned": result["learned"],
    })

    # Format result
    weights_changed = hash_before != hash_after
    output = f"""## Observation Result
**Content:** "{content[:100]}{'...' if len(content) > 100 else ''}"
### Metrics (REAL - from PyTorch gradient descent)
| Metric | Value |
|--------|-------|
| **Surprise** | {result['surprise']:.4f} |
| **Weight Delta** | {result['weight_delta']:.6f} |
| **Weights Changed** | {'YES' if weights_changed else 'NO'} |
| **Hash Before** | `{hash_before}` |
| **Hash After** | `{hash_after}` |
### What Just Happened
1. Text was encoded to tensor representation
2. Forward pass through neural memory network
3. **Surprise computed** via prediction error (MSE loss)
4. **Gradients calculated** via `torch.autograd.grad()`
5. **Weights updated** via gradient descent: `param -= lr * grad`
This is REAL test-time training. The neural network's weights physically changed.
"""
    return output, create_weight_visualization(), create_history_plot()
def check_surprise(content: str) -> str:
    """Report how novel *content* is to the memory, without learning from it.

    Returns a Markdown report containing the score from ``memory.surprise``
    and a verdict line, or a prompt message when *content* is blank.
    """
    if content.strip() == "":
        return "Please enter content to check."

    # REAL surprise computation (no learning)
    surprise = memory.surprise(content)

    # Verdict bands: below 0.3 familiar, above 0.6 novel, otherwise in between.
    if surprise < 0.3:
        verdict = 'This content is FAMILIAR to the memory.'
    elif surprise > 0.6:
        verdict = 'This content is NOVEL to the memory.'
    else:
        verdict = 'This content is somewhat familiar.'

    preview = content[:100]
    ellipsis = '...' if len(content) > 100 else ''
    return f"""## Surprise Check (No Learning)
**Content:** "{preview}{ellipsis}"
**Surprise Score:** {surprise:.4f}
Interpretation:
- **< 0.3**: Very familiar - memory has seen similar patterns
- **0.3 - 0.6**: Moderately novel
- **> 0.6**: Highly novel - worth learning
{verdict}
"""
def get_memory_stats() -> str:
    """Return a Markdown summary of the neural memory's statistics.

    Values come from ``memory.get_stats()`` and ``memory.get_weight_hash()``;
    the size estimate multiplies the parameter count by 4 bytes.
    """
    stats = memory.get_stats()
    weight_hash = memory.get_weight_hash()
    size_kb = stats['weight_parameters'] * 4 / 1024

    return f"""## Memory Statistics
| Metric | Value |
|--------|-------|
| **Total Observations** | {stats['total_observations']} |
| **Parameters** | {stats['weight_parameters']:,} |
| **Dimension** | {stats['dimension']} |
| **Learning Rate** | {stats['learning_rate']:.4f} |
| **Avg Recent Surprise** | {stats['avg_surprise']:.4f} |
| **Current Weight Hash** | `{weight_hash}` |
### This is a Real Neural Network
- **Architecture**: 2-layer MLP with GELU activation and LayerNorm
- **Framework**: PyTorch with autograd
- **Learning**: Test-time training via gradient descent
- **Memory**: ~{size_kb:.1f} KB of weights
Unlike RAG which stores vectors in a database, this IS the memory.
The weights encode everything learned.
"""
# =============================================================================
# KEY CONCEPTS (New Educational Tab)
# =============================================================================
KEY_CONCEPTS_HTML = '''
β The Problem: LLMs Have No Memory
Every API call to an LLM starts fresh:
// Call 1
User: "My name is Carlos"
AI: "Nice to meet you, Carlos!"
// Call 2 (new session)
User: "What's my name?"
AI: "I don't know your name."
The model's weights are frozen after training:
- Can't learn new information
- Can't remember past conversations
- Can't adapt to user preferences
- Knowledge is static (training cutoff)
Two Solutions to Add Memory
π Solution A: RAG (Retrieval)
Store information externally, retrieve relevant pieces when needed.
HOW
- Convert facts to vectors (embeddings)
- Store in vector database
- On query, find similar vectors
- Pass retrieved docs to LLM
β Simple
β Scalable
β No patterns
β Grows
π§ Solution B: Neural Memory (Learning)
Learn information into neural weights. Memory IS the network.
HOW
- Encode fact as tensor
- Forward pass through neural net
- Compute prediction error (surprise)
- Update weights via gradient descent
β Learns patterns
β Fixed size
β Can infer
β Complex
β‘ The Innovation: Test-Time Training (TTT)
Traditional Training
Train once β Freeze weights β Deploy
Model can't learn after deployment
Test-Time Training (Titans)
Deploy β Continue learning β Weights update
Model learns from every interaction
This demo implements real TTT: When you add a fact, actual PyTorch gradients flow and actual neural network weights change. This is not a simulationβit's the Titans architecture from Google's December 2024 paper.
'''
# =============================================================================
# INCREMENTAL INTEGRATION DIAGRAMS
# =============================================================================
VANILLA_LLM_DIAGRAM_HTML = '''
Step 1
Vanilla LLM (The Problem)
π€
User Query
"What's my preference?"
β
β
β
No Memory
"I don't know"
β οΈ LLM has no way to remember user-specific information between sessions
'''
RAG_INTEGRATION_DIAGRAM_HTML = '''
Step 2a
Adding RAG (Retrieval-Augmented Generation)
β
π
Retriever
keyword match
β
ποΈ
Vector DB
top-k docs
β
π
Context
prompt injection
β
β External memory storage
β No pattern learning
'''
NEURAL_MEMORY_INTEGRATION_DIAGRAM_HTML = '''
Step 2b
Adding Neural Memory (Test-Time Training)
β
π§
Neural Memory
TTT Module
β
β
π
Rich Context
all facts + hints
β
β Learns patterns
β Fixed memory size
β Can infer/predict
'''
DOCKER_DEPLOYMENT_DIAGRAM_HTML = '''
Step 3
Docker Deployment (Production Ready)
π³
Docker Container
π§ Neural Memory
PyTorch TTT Module
π MCP Server
Claude Desktop Integration
π HTTP API
REST Endpoints
Why Docker? Learned neural weights persist across container restarts via Docker volumes. Deploy anywhere with identical behavior. Version control your AI's memory state like Git commits.
'''
# =============================================================================
# DOCKER ECOSYSTEM INTEGRATION
# =============================================================================
# Markdown text describing the Docker/MCP deployment story. The Python and
# YAML samples inside the code fences are indented so that the displayed
# snippets are valid Python and a valid compose file.
DOCKER_INTEGRATION_MD = """
## Docker Ecosystem Integration
This neural memory is designed for **containerized deployment** with full Docker integration.
### MCP Server Interface
The memory exposes tools via Model Context Protocol (MCP):
```python
# MCP Tools Available
@mcp.tool()
def observe(content: str) -> dict:
    '''Feed context, trigger learning.'''
    return memory.observe(content)
@mcp.tool()
def surprise(content: str) -> float:
    '''Measure novelty without learning.'''
    return memory.surprise(content)
@mcp.tool()
def checkpoint(name: str) -> str:
    '''Save learned state to Docker volume.'''
    return save_checkpoint(name)
@mcp.tool()
def restore(name: str) -> str:
    '''Load previous state from Docker volume.'''
    return load_checkpoint(name)
```
### Docker Compose Deployment
```yaml
version: '3.8'
services:
  neural-memory:
    build: .
    ports:
      - "8000:8000" # MCP server
    volumes:
      - memory-state:/app/checkpoints # Persistent state
    environment:
      - MEMORY_DIM=512
      - LEARNING_RATE=0.01
volumes:
  memory-state: # State survives container restarts
```
### Key Docker-Native Features
| Feature | Implementation |
|---------|---------------|
| **State Persistence** | Docker volumes for checkpoints |
| **Horizontal Scaling** | Stateless inference, shared state via volume |
| **CI/CD Integration** | GitHub Actions with Docker build |
| **Resource Control** | Container limits for GPU/memory |
| **Health Checks** | `/health` endpoint with memory stats |
### Why Docker + Neural Memory?
1. **Containerized AI Memory**: Package learned state with your app
2. **Version Control**: Checkpoint states like Git commits
3. **Reproducibility**: Same container = same behavior
4. **Orchestration Ready**: Deploy to Kubernetes, ECS, etc.
5. **MCP Protocol**: Claude Desktop integration via container
---
*This project demonstrates production-grade AI infrastructure with Docker.*
"""
# =============================================================================
# ARCHITECTURE DIAGRAMS (How It Works)
# =============================================================================
# Intro text at the top of the "How It Works" tab.
# The blank line before "---" matters: a paragraph immediately followed by
# "---" is parsed as a setext level-2 heading instead of text + divider.
ARCHITECTURE_INTRO_MD = """
## How It Works: Neural Memory vs RAG Architecture

This section provides a detailed look at how both systems process information and connect to the LLM.
The diagrams below are **faithful representations of our actual implementation**.

---
"""
# Neural-memory pipeline diagram for the "How It Works" tab (learning phase,
# then query phase). Passed to gr.HTML() in the interface definition.
# NOTE(review): glyphs such as "β", "π", "Γ" and "β’" look like UTF-8 symbols
# decoded with the wrong codec — confirm the file encoding.
# NOTE(review): this text says "~250K params" while ARCHITECTURE_SUMMARY_MD
# says "~262K parameters" — confirm the real figure against memory_net.
NEURAL_MEMORY_DIAGRAM_HTML = """
Neural Memory Pipeline (Test-Time Training)
Phase 1: Learning (When Facts Are Added)
π
User Fact
"Carlos rejected
bright colors"
β
π’
_encode_text()
Tensor [1, 64, 256]
β
π§
memory_net(x)
2-layer MLP
~250K params
β
π
MSE Loss
Surprise Score
= Prediction Error
β
β‘
WEIGHT UPDATE
torch.autograd.grad()
param -= lr Γ grad
Key Point: The neural network's weights physically change after each fact.
This is real gradient descent happening at inference time (Test-Time Training / Titans architecture).
Phase 2: Query (When Questions Are Asked)
β
Question
"Will Carlos
like neon?"
β
π―
surprise()
Novelty Score
(No Learning)
β
π¦
Build Context
ALL facts
+ Pattern hints
+ Surprise score
β
π¬
System Prompt
"You have LEARNED from:
β’ All 4 observations
β’ Identify PATTERNS
β’ Make INFERENCES"
β
π€
LLM
SmolLM3-3B
(HuggingFace)
Key Point: The LLM receives ALL facts + learned pattern hints + novelty indicator.
It's instructed to identify patterns and make inferences, not just retrieve.
"""
# RAG pipeline diagram for the "How It Works" tab (storage phase, then query
# phase). Passed to gr.HTML() in the interface definition.
# NOTE(review): glyphs such as "β" and "π" look like UTF-8 symbols decoded
# with the wrong codec — confirm the file encoding.
# NOTE(review): this text says the LLM sees 2 retrieved facts, whereas the RAG
# prompt example in LLM_INTEGRATION_MD shows "Only 1 fact retrieved" — confirm
# which matches the retrieval code.
RAG_DIAGRAM_HTML = """
RAG Pipeline (Retrieval-Augmented Generation)
Phase 1: Storage (When Facts Are Added)
π
User Fact
"Carlos rejected
bright colors"
β
π
knowledge_base.append()
Simple list storage
No transformation
β
ποΈ
Document Store
List of strings
Grows with data
Key Point: Facts are simply stored as-is. No learning occurs.
The system doesn't understand relationships or patterns between facts.
Phase 2: Query (When Questions Are Asked)
β
Question
"Will Carlos
like neon?"
β
βοΈ
Tokenize
Split into words
{"will", "carlos",
"like", "neon"}
β
π
Keyword Overlap
Count matching words
between Q and each fact
β
π
Top-2 Facts
Only highest
overlap scores
β
π¬
System Prompt
"You are a RAG system.
ONLY use retrieved facts.
If not covered, say so."
β
π€
LLM
SmolLM3-3B
(Same model!)
Key Point: The LLM only sees 2 retrieved facts (not all 4).
"neon" β "bright" keyword-wise, so relevant facts may not be retrieved!
"""
def _prompt_box(*sections, width=67):
    """Render *sections* (lists of text lines) as one box-drawing frame.

    Sections are stacked top to bottom and separated by a horizontal rule.
    Every line is left-justified and padded to *width* columns, so the right
    border stays aligned however the prompt text is edited later.

    Returns the frame as a single newline-joined string (no trailing newline).
    """
    rule = "─" * (width + 2)
    rows = []
    for section in sections:
        # The first section opens the frame; later ones get a divider.
        rows.append(f"├{rule}┤" if rows else f"┌{rule}┐")
        rows.extend(f"│ {line:<{width}} │" for line in section)
    rows.append(f"└{rule}┘")
    return "\n".join(rows)


# The user turn is identical in both prompt examples.
_EXAMPLE_USER_TURN = [
    'USER: "We have a new UI mockup with neon colors - will Carlos',
    '       like it?"',
]

_NEURAL_PROMPT_BOX = _prompt_box(
    ["SYSTEM PROMPT (Neural Memory)"],
    [
        '"You are an AI with neural memory that has LEARNED from all',
        "observations below. Unlike simple retrieval, you should:",
        "",
        "1. Consider ALL facts holistically",
        "2. Identify PATTERNS across multiple observations",
        "3. Make INFERENCES based on learned patterns",
        "4. Predict based on trends, not just direct matches",
        "",
        "Observations (learned knowledge):",
        "- Carlos rejected the bright colorful design",
        "- Carlos rejected the flashy animated homepage",
        "- Carlos approved the minimalist dark layout",
        "- Carlos approved the clean monochrome interface",
        "",
        "Learned patterns from observations:",
        "- Positive signals: 2 approvals",
        "- Negative signals: 2 rejections",
        "- Look for common themes in approved vs rejected items",
        "",
        "Question novelty (surprise score): 0.89",
        '- High surprise (>0.6): This is a novel topic, be cautious"',
    ],
    _EXAMPLE_USER_TURN,
)

_RAG_PROMPT_BOX = _prompt_box(
    ["SYSTEM PROMPT (RAG)"],
    [
        '"You are a RAG system. You can ONLY use the retrieved facts below',
        "to answer. If the retrieved facts don't directly answer the",
        "question, say 'The retrieved information doesn't cover this.'",
        "",
        "Retrieved facts:",
        "- Carlos rejected the bright colorful design",
        "(Only 1 fact retrieved - 'neon' didn't match other keywords!)",
    ],
    _EXAMPLE_USER_TURN,
)

# Markdown for the "How It Works" tab explaining what context each system hands
# to the LLM. Blank lines separate block elements so headings, fences and
# tables render independently of the surrounding paragraphs.
LLM_INTEGRATION_MD = (
    """
---

## How Each System Connects to the LLM

Both systems use the **exact same LLM** (HuggingFace SmolLM3-3B). The difference is **what context they provide**.

### Neural Memory → LLM Connection

```
"""
    + _NEURAL_PROMPT_BOX
    + """
```

**What the Neural Memory provides:**

| Component | Purpose |
|-----------|---------|
| **ALL facts** | Complete context for holistic reasoning |
| **Pattern hints** | Extracted approval/rejection counts |
| **Surprise score** | Indicates if question is familiar or novel |
| **Inference instructions** | Tells LLM to identify patterns and predict |

---

### RAG → LLM Connection

```
"""
    + _RAG_PROMPT_BOX
    + """
```

**What RAG provides:**

| Component | Purpose |
|-----------|---------|
| **Top-2 facts only** | Limited context based on keyword overlap |
| **No pattern info** | System doesn't analyze relationships |
| **No novelty signal** | No indication of question familiarity |
| **Strict retrieval instructions** | Tells LLM to only use retrieved facts |

---

## The Critical Difference: What Goes Into the LLM
"""
)
# Side-by-side summary of what the LLM receives from each system; shown in the
# "How It Works" tab via gr.HTML() in the interface definition.
# NOTE(review): glyphs such as "β", "π" and "β’" look like UTF-8 symbols
# decoded with the wrong codec — confirm the file encoding.
COMPARISON_TABLE_HTML = """
Side-by-Side: What the LLM Receives
π§ Neural Memory
β
Facts Provided:
ALL 4 facts (complete knowledge)
β
Pattern Analysis:
β’ 2 approvals identified
β’ 2 rejections identified
β’ "Look for common themes"
β
Novelty Signal:
Surprise score: 0.89 (novel topic)
β
Instructions:
"Identify PATTERNS"
"Make INFERENCES"
"Predict based on trends"
π RAG
β οΈ Facts Provided:
Only 1-2 facts (keyword match)
"neon" β "bright", "colorful", etc.
β Pattern Analysis:
None - no relationship detection
β Novelty Signal:
None - no familiarity indicator
β οΈ Instructions:
"ONLY use retrieved facts"
"If not covered, say so"
No inference allowed
"""
def _frame(lines, width):
    """Return *lines* enclosed in a box-drawing frame, padded to *width* columns."""
    rule = "─" * (width + 2)
    return [f"┌{rule}┐", *(f"│ {line:<{width}} │" for line in lines), f"└{rule}┘"]


def _stage(title, detail=(), footer=(), width=43):
    """Frame one pipeline stage for the architecture diagrams.

    The stage shows *title*, then an optional nested frame holding *detail*,
    then optional *footer* lines. The nested frame is sized to fill the outer
    one exactly (an inner frame is 4 columns wider than its content).
    """
    nested = _frame(detail, width - 4) if detail else []
    return _frame([title, *nested, *footer], width)


# Connector drawn between consecutive stages.
_FLOW_ARROW = ["      │", "      ▼"]

_NEURAL_ARCH_DIAGRAM = "\n".join([
    "Input Text",
    *_FLOW_ARROW,
    *_stage("_encode_text(text)", [
        "1. Convert to ASCII ordinals",
        "2. Pad/truncate to max_seq_len (64)",
        "3. Project to dimension (256)",
        "4. Output: Tensor [1, 64, 256]",
    ]),
    *_FLOW_ARROW,
    # NOTE(review): the layer sizes listed here (256→1024→256 plus LayerNorm)
    # would total roughly 528K parameters with biases, which matches neither
    # "~262K" below nor "~250K" in the table — confirm against memory_net.
    *_stage("memory_net (nn.Sequential)", [
        "Linear(256 → 1024)",
        "GELU activation",
        "LayerNorm(1024)",
        "Linear(1024 → 256)",
    ], ["Total: ~262K parameters"]),
    *_FLOW_ARROW,
    *_stage("_compute_surprise_tensor(input, output)", [
        "loss = MSE(output, target)",
        "surprise = sigmoid(loss) scaled to 0-1",
    ]),
    *_FLOW_ARROW,
    *_stage("_update_weights(loss) [IF learn=True]", [
        "grads = torch.autograd.grad(loss, θ)",
        "for each (param, grad):",
        "    param -= learning_rate × grad",
    ], ["⚡ This is the key innovation!"]),
])

_RAG_ARCH_DIAGRAM = "\n".join([
    "Input Text",
    *_FLOW_ARROW,
    *_stage(
        'knowledge_base.append({"fact": text, ...})',
        footer=["Simple list storage - no transformation"],
    ),
    "",
    "Query",
    *_FLOW_ARROW,
    *_stage("Keyword Overlap Scoring", [
        "question_words = set(query.split())",
        "for fact in knowledge_base:",
        "    fact_words = set(fact.split())",
        "    score = len(question_words",
        "                ∩ fact_words)",
    ]),
    *_FLOW_ARROW,
    *_stage(
        "Top-K Selection (K=2 in our demo)",
        footer=["Return facts with highest overlap scores"],
    ),
])

# Closing Markdown of the "How It Works" tab: comparison table, guidance on
# when each approach fits, and the two implementation diagrams built above.
ARCHITECTURE_SUMMARY_MD = (
    """
---

## Why This Architecture Matters

### The Learning Advantage

| Aspect | Neural Memory | RAG |
|--------|---------------|-----|
| **Storage** | Fixed neural weights (~250K params) | Growing document store |
| **Learning** | Yes - weights update per observation | No - just stores text |
| **Retrieval** | Not needed - patterns in weights | Required - keyword matching |
| **Inference** | Can generalize to novel queries | Limited to direct matches |
| **Memory Size** | Constant (doesn't grow) | Linear growth with data |

### When Neural Memory Wins

The architecture shines when:

1. **Pattern Recognition Required** - "Carlos likes X, dislikes Y" → predict for Z
2. **Novel Queries** - Question keywords don't match stored facts
3. **Holistic Reasoning** - Answer requires synthesizing multiple facts
4. **Bounded Memory** - Can't afford growing storage

### When RAG Might Be Better

RAG is simpler when:

1. **Exact Retrieval** - "What did the document say about X?"
2. **Large Corpus** - Millions of documents to search
3. **No Patterns** - Facts are independent, not related
4. **Transparency** - Need to cite exact source documents

---

## Technical Implementation Details

### Neural Memory Architecture

```
"""
    + _NEURAL_ARCH_DIAGRAM
    + """
```

### RAG Architecture (Simplified for Demo)

```
"""
    + _RAG_ARCH_DIAGRAM
    + """
```

---

*These diagrams represent the actual implementation in this demo. The code is open source.*
"""
)
# Markdown for the "About" tab.
# Block elements are separated by blank lines; in particular a paragraph that
# is directly followed by "---" would otherwise render as a level-2 heading.
ABOUT_MD = """
## About This Project

### What Makes This Special

This is **NOT a simulation**. The demo runs real PyTorch code:

1. **Real Neural Network**: 2-layer MLP with ~250K parameters
2. **Real Gradient Descent**: `torch.autograd.grad()` computes gradients
3. **Real Weight Updates**: Parameters change during inference
4. **Real Surprise Metric**: MSE loss measures prediction error

### The Titans Architecture

Based on Google's December 2024 paper: [arxiv.org/abs/2501.00663](https://arxiv.org/abs/2501.00663)

**Key Innovation**: The memory IS a neural network. Instead of storing vectors,
it learns patterns by updating weights during inference.

### Docker Integration

- **MCP Server**: Model Context Protocol for Claude Desktop
- **Checkpoints**: Save/restore learned state via Docker volumes
- **Container-Native**: Designed for orchestrated deployment

---

## Limitations

This is a **demonstration project**, not a production-ready system:

| Component | Current State | Production Would Need |
|-----------|---------------|----------------------|
| **RAG Implementation** | Simplified keyword matching | Vector embeddings + semantic search (FAISS, Pinecone) |
| **Neural Memory** | Basic 2-layer MLP | Deeper architecture, attention mechanisms |
| **Scalability** | Single-user demo | Distributed inference, GPU optimization |
| **Evaluation** | Qualitative comparison | Benchmarks, ablation studies, metrics |
| **Memory Capacity** | ~250K parameters | Larger models, hierarchical memory |

The RAG comparison uses simple word overlap scoring to demonstrate *why* keyword-based retrieval fails for pattern inference. A production RAG system would use proper embeddings and vector similarity search.

---

## Acknowledgments

This project builds on the work of brilliant researchers:

**Core Research:**

- **Titans: Learning to Memorize at Test Time** (Google, Dec 2024) — [arXiv:2501.00663](https://arxiv.org/abs/2501.00663)
  - Ali Behrouz, Peilin Zhong, Vahab Mirrokni
- **Learning to (Learn at Test Time): RNNs with Expressive Hidden States** (Stanford/Meta, Jul 2024) — [arXiv:2407.04620](https://arxiv.org/abs/2407.04620)
  - Yu Sun, Xinhao Li, Karan Dalal, et al.

**Frameworks & Tools:**

- [PyTorch](https://pytorch.org/) — The foundation for neural memory implementation
- [Gradio](https://gradio.app/) — Interactive demo interface
- [HuggingFace](https://huggingface.co/) — Model hosting and inference API
- [Model Context Protocol](https://modelcontextprotocol.io/) — Claude Desktop integration

**Inspiration:**

- The broader ML community exploring alternatives to attention-based memory
- Open-source contributors who make research accessible

---

## Next Steps

Potential improvements for future iterations:

1. **Real RAG Baseline**: Integrate sentence-transformers + FAISS for proper semantic retrieval comparison
2. **Attention-Based Memory**: Implement the full Titans architecture with neural long-term memory gates
3. **Benchmarking**: Add quantitative evaluation on standard memory tasks (bAbI, etc.)
4. **Multi-Modal Support**: Extend to image/audio observations
5. **Distributed Memory**: Explore memory sharing across multiple agents
6. **Fine-Grained Forgetting**: Implement selective memory consolidation/pruning

---

## Built By

**Carlos Crespo Macaya**

AI Engineer — GenAI Systems & Applied MLOps

This project demonstrates the ability to:

1. Read cutting-edge research (Titans paper)
2. Implement it correctly (PyTorch TTT)
3. Productionize it (Docker, MCP, CI/CD)
4. Communicate it effectively (this demo)
"""
# =============================================================================
# GRADIO INTERFACE
# =============================================================================
# Build the five-tab Gradio interface. Components are created at module level
# inside the Blocks context; event handlers are wired right after the
# components they connect.
with gr.Blocks(title="Docker Neural Memory", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
    # Branded header
    gr.HTML(HEADER_HTML)

    with gr.Tabs():
        # TAB 1: Comparison Demo (NEW - Main Feature)
        with gr.TabItem("LLM Comparison"):
            # Blank lines keep each statement its own paragraph; without them
            # Markdown folds the four lines into one run-on paragraph.
            gr.Markdown("""
                ### Neural Memory vs RAG (Retrieval-Augmented Generation)

                **Step 1:** Teach the system facts about preferences/patterns

                **Step 2:** Ask questions that require **inference**, not just retrieval

                **RAG** retrieves similar documents but can't learn patterns.

                **Neural Memory** learns from ALL observations and can infer from trends.
                """)

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("#### Step 1: Teach Facts")
                    fact_input = gr.Textbox(
                        label="Add a Fact",
                        placeholder="e.g., 'Carlos prefers VSCode over Vim'",
                        lines=2,
                    )
                    add_fact_btn = gr.Button("Add to Knowledge Base", variant="secondary")
                    fact_output = gr.Markdown()

                    gr.Markdown("#### Example Facts to Try")
                    # The blank line after item 4 ends the list; otherwise
                    # "Then ask:" is absorbed into the last list item.
                    gr.Markdown("""
                        **Scenario: Learning User Preferences (Pattern Recognition)**

                        1. "Carlos rejected the bright colorful design"
                        2. "Carlos rejected the flashy animated homepage"
                        3. "Carlos approved the minimalist dark layout"
                        4. "Carlos approved the clean monochrome interface"

                        Then ask: **"We have a new UI mockup with neon colors - will Carlos like it?"**

                        *Neural Memory learns the pattern (Carlos prefers dark/minimal). RAG just retrieves similar facts without inferring the preference pattern.*
                        """)

                with gr.Column(scale=1):
                    gr.Markdown("#### Knowledge Base (RAG Store)")
                    kb_plot = gr.Plot(label="Facts Stored")

            # Visualizations row
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("#### Neural Memory State")
                    neural_state_plot = gr.Plot(label="Neural Network Weights & Stats")
                with gr.Column(scale=1):
                    gr.Markdown("#### Embedding Space")
                    tsne_plot = gr.Plot(label="t-SNE/PCA Visualization")

            # Adding a fact refreshes the status text and all three plots.
            add_fact_btn.click(
                add_to_knowledge_base,
                inputs=[fact_input],
                outputs=[fact_output, tsne_plot, neural_state_plot, kb_plot],
            )

            gr.Markdown("---")
            gr.Markdown("#### Step 2: Ask Questions & Compare Responses")

            with gr.Row():
                with gr.Column(scale=2):
                    question_input = gr.Textbox(
                        label="Ask a Question",
                        placeholder="e.g., 'We have a new UI mockup with neon colors - will Carlos like it?'",
                        lines=2,
                    )
                with gr.Column(scale=1):
                    gr.Markdown("""
                        **Best Questions for Neural Memory:**

                        - Questions requiring **pattern inference**
                        - Questions about **preferences/trends**
                        - Questions needing **generalization**
                        """)

            with gr.Row():
                compare_btn = gr.Button("Compare Responses", variant="primary", size="lg")
                reset_compare_btn = gr.Button("Reset Comparison", variant="secondary")

            # Response display - side by side with clear headers
            with gr.Row():
                with gr.Column():
                    gr.Markdown("##### Neural Memory Response")
                    nm_response = gr.Markdown()
                with gr.Column():
                    gr.Markdown("##### RAG Response")
                    vanilla_response = gr.Markdown()

            comparison_summary = gr.Markdown(label="Comparison Metrics")

            compare_btn.click(
                compare_responses,
                inputs=[question_input],
                outputs=[nm_response, vanilla_response, comparison_summary, neural_state_plot, kb_plot],
            )
            # NOTE(review): nm_response / vanilla_response are not among the
            # reset outputs, so earlier answers appear to stay on screen after
            # a reset — confirm whether reset_comparison should clear them too.
            reset_compare_btn.click(
                reset_comparison,
                outputs=[comparison_summary, tsne_plot, neural_state_plot, kb_plot],
            )

        # TAB 2: How It Works (Architecture Diagrams)
        with gr.TabItem("How It Works"):
            gr.Markdown(ARCHITECTURE_INTRO_MD)
            # Neural Memory Diagram
            gr.HTML(NEURAL_MEMORY_DIAGRAM_HTML)
            # RAG Diagram
            gr.HTML(RAG_DIAGRAM_HTML)
            # LLM Integration Explanation
            gr.Markdown(LLM_INTEGRATION_MD)
            # Side-by-side comparison table
            gr.HTML(COMPARISON_TABLE_HTML)
            # Architecture Summary
            gr.Markdown(ARCHITECTURE_SUMMARY_MD)

        # TAB 3: Key Concepts
        with gr.TabItem("Key Concepts"):
            gr.HTML(KEY_CONCEPTS_HTML)

        # TAB 4: Integration & Docker
        with gr.TabItem("Integration & Docker"):
            gr.Markdown("## How Memory Modules Integrate with LLMs")
            gr.Markdown("Follow this incremental explanation to understand how both RAG and Neural Memory attach to a vanilla LLM.")
            # Step 1: Vanilla LLM
            gr.HTML(VANILLA_LLM_DIAGRAM_HTML)
            # Step 2a: RAG Integration
            gr.HTML(RAG_INTEGRATION_DIAGRAM_HTML)
            # Step 2b: Neural Memory Integration
            gr.HTML(NEURAL_MEMORY_INTEGRATION_DIAGRAM_HTML)
            # Step 3: Docker Deployment
            gr.HTML(DOCKER_DEPLOYMENT_DIAGRAM_HTML)
            # Docker details
            gr.Markdown(DOCKER_INTEGRATION_MD)

        # TAB 5: About
        with gr.TabItem("About"):
            gr.Markdown(ABOUT_MD)

    # Polished footer with profile links
    gr.HTML(FOOTER_HTML)


if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside a container.
    demo.launch(server_name="0.0.0.0", server_port=7860)