openagi-agi committed on
Commit
7cd7caf
·
verified ·
1 Parent(s): dd914f5

Upload 8 files

Browse files
Files changed (8) hide show
  1. decoder.py +103 -0
  2. formatter.py +77 -0
  3. main.py +79 -0
  4. pre_embed.py +55 -0
  5. test_chat.py +91 -0
  6. test_embed.py +18 -0
  7. test_full.py +203 -0
  8. testcuda.py +6 -0
decoder.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from transformers import T5Tokenizer
from sentence_transformers import SentenceTransformer

# ===== CONFIG =====
INPUT_FILE = "chat_1turn.csv"
EMB_FILE = "chat_embeddings.pt"
MODEL_NAME = "Snowflake/snowflake-arctic-embed-l-v2.0"
EPOCHS = 80
BATCH_SIZE = 16
HIDDEN_DIM = 512
MAX_LEN = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ===== Load CSV =====
df = pd.read_csv(INPUT_FILE)
sources = df["source"].fillna("").tolist()
targets = df["target"].fillna("").tolist()

# ===== Tokenizer =====
tokenizer = T5Tokenizer.from_pretrained("t5-small")
target_enc = tokenizer(targets, padding=True, truncation=True,
                       return_tensors="pt", max_length=MAX_LEN)
input_ids = target_enc["input_ids"].to(device)
attention_mask = target_enc["attention_mask"].to(device)

# ===== Load embeddings =====
# FIX: pass map_location=device — the .pt file is produced on a CUDA machine
# (pre_embed.py saves GPU tensors), so a plain torch.load crashes on a
# CPU-only host with "Attempting to deserialize object on a CUDA device".
emb_data = torch.load(EMB_FILE, map_location=device)
x_embeddings = emb_data["source"].to(device)  # not used directly in this training
y_embeddings = emb_data["target"].to(device)  # used to condition decoder
36
+
37
# ===== Decoder =====
class EmbeddingDecoder(nn.Module):
    """GRU decoder that turns a sentence embedding into token logits.

    The embedding is projected into the GRU's initial hidden state; tokens
    are then produced autoregressively, optionally teacher-forced per step.
    """

    def __init__(self, input_dim, hidden_dim, vocab_size):
        super().__init__()
        self.bridge = nn.Linear(input_dim, hidden_dim)    # embedding -> h0
        self.embed = nn.Embedding(vocab_size, hidden_dim)
        self.gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, emb_vec, target_ids=None, teacher_forcing_ratio=0.5, max_len=MAX_LEN):
        batch = emb_vec.size(0)
        hidden = self.bridge(emb_vec).unsqueeze(0)        # [1,B,H]

        # First input token: T5's pad id (0) doubles as the start symbol.
        step_input = torch.full((batch, 1), tokenizer.pad_token_id, device=emb_vec.device)

        step_logits = []
        for t in range(max_len):
            out, hidden = self.gru(self.embed(step_input), hidden)  # [B,1,H]
            logits = self.fc(out.squeeze(1))                        # [B,V]
            step_logits.append(logits.unsqueeze(1))

            # Teacher-force with the ground-truth token, else feed the argmax.
            # (rand is only drawn when targets are available — same
            # short-circuit order as before.)
            use_teacher = (target_ids is not None
                           and t < target_ids.size(1)
                           and torch.rand(1).item() < teacher_forcing_ratio)
            if use_teacher:
                step_input = target_ids[:, t].unsqueeze(1)
            else:
                step_input = torch.argmax(logits, dim=-1, keepdim=True)

        return torch.cat(step_logits, dim=1)  # [B, max_len, V]
66
+
67
# ===== Train =====
decoder = EmbeddingDecoder(y_embeddings.shape[1], HIDDEN_DIM, tokenizer.vocab_size).to(device)
optimizer = optim.Adam(decoder.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

print("Training decoder...")
for epoch in range(EPOCHS):
    decoder.train()
    total_loss = 0.0
    # Mini-batches over the target-space embeddings, in file order.
    for start in range(0, len(y_embeddings), BATCH_SIZE):
        emb_batch = y_embeddings[start:start + BATCH_SIZE]
        id_batch = input_ids[start:start + BATCH_SIZE]

        optimizer.zero_grad()
        logits = decoder(emb_batch, target_ids=id_batch,
                         teacher_forcing_ratio=0.7, max_len=id_batch.size(1))
        loss = criterion(logits.reshape(-1, logits.size(-1)), id_batch.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss:.4f}")
88
+
89
# ===== Inference =====
embedder = SentenceTransformer(MODEL_NAME, device=device)

def generate(text, max_len=30, use_mapper=False, mapper=None):
    """Embed `text`, optionally map it into target space, then greedy-decode."""
    with torch.no_grad():
        emb = embedder.encode([text], convert_to_tensor=True, device=device)
        if use_mapper and mapper is not None:
            emb = mapper(emb)
        logits = decoder(emb, target_ids=None, teacher_forcing_ratio=0.0, max_len=max_len)
        ids = torch.argmax(logits, dim=-1).squeeze(0).tolist()
    return tokenizer.decode(ids, skip_special_tokens=True)

# ===== Test =====
print("Hi ->", generate("Hi"))
formatter.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from datasets import load_dataset
from transformers import T5Tokenizer
import pandas as pd, csv, re
from tqdm import tqdm

# ── Config ────────────────────────────────────────────────────────────────
jsonl_path = "lmsys_chat_1m_full.jsonl"   # local file
use_subset = False                        # False ⇒ full 1 M rows
num_samples = 500                         # rows kept when use_subset is True
max_turn_pairs = 1                        # N user+assistant exchanges kept
max_input_tokens = 512                    # fits t5-small/base
# ──────────────────────────────────────────────────────────────────────────

tok = T5Tokenizer.from_pretrained("t5-small")
ds = load_dataset("json", data_files=jsonl_path, split="train")

if use_subset:
    ds = ds.select(range(min(num_samples, len(ds))))
    print(f"🔍 subset → {len(ds)} rows")
20
+
21
def mostly_ascii(s: str, threshold: float = .3) -> bool:
    """True when fewer than `threshold` of the characters are non-ASCII.

    An empty string counts as not-mostly-ASCII (returns False).
    """
    if not s:
        return False
    non_ascii = sum(1 for ch in s if ord(ch) > 127)
    return non_ascii / len(s) < threshold
26
+
27
def format_turns(conv):
    """Render [{'role', 'content'}, ...] messages as 'Role: text' lines."""
    lines = []
    for msg in conv:
        lines.append(f"{msg['role'].capitalize()}: {msg['content'].strip()}")
    return lines
29
+
30
def build_pair(turns, max_tokens=512):
    """Build a (prompt, target) pair from formatted turn strings.

    Keeps the last `max_turn_pairs` exchanges, drops the oldest turn until
    the prompt fits `max_tokens`, and filters short / non-ASCII pairs.
    Returns None when no usable pair can be made.
    """
    if len(turns) < max_turn_pairs * 2:
        return None

    use_turns = turns[-(max_turn_pairs * 2):]        # last N exchanges

    prompt = "chat:\n\n" + "\n\n".join(use_turns[:-1])
    target = use_turns[-1].replace("Assistant: ", "", 1)

    # Trim the oldest remaining turn until the prompt fits the token budget.
    for _ in range(max_turn_pairs):
        if len(tok.tokenize(prompt)) <= max_tokens:
            break                                    # fits → good
        sep_pos = prompt.find("\n\n", len("chat:\n\n"))
        if sep_pos == -1:                            # no more turns to drop
            return None
        prompt = "chat:\n\n" + prompt[sep_pos + 2:]
    else:
        return None                                  # still too long after all trims

    if len(prompt) < 30 or len(target) < 10:
        return None
    if not mostly_ascii(prompt + target):
        return None
    return prompt, target
58
+
59
+
60
rows, kept = [], 0
for ex in tqdm(ds, desc="formatting"):
    conv = ex.get("conversation")
    if not isinstance(conv, list):
        continue
    p = build_pair(format_turns(conv))
    if p:
        rows.append({"source": p[0], "target": p[1]})
        kept += 1

print(f"✅ kept {kept} examples")

pd.DataFrame(rows).to_csv(
    "chat_1turn.csv",
    index=False,
    quoting=csv.QUOTE_ALL,   # preserves embedded newlines
    encoding="utf-8"
)
# FIX: the message previously claimed "t5_chat_4turn.csv"; the file actually
# written (and read by pre_embed.py / decoder.py) is chat_1turn.csv.
print("💾 saved → chat_1turn.csv")
main.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# ===== Load precomputed embeddings =====
# FIX: map_location="cpu" — the embeddings are saved from a CUDA run
# (pre_embed.py), so a plain torch.load fails on a CPU-only machine.
# They are moved onto the selected device just below.
emb_data = torch.load("chat_embeddings.pt", map_location="cpu")

x_embeddings = emb_data["source"]   # [N, D]
y_embeddings = emb_data["target"]   # [N, D]

print("Source shape:", x_embeddings.shape)
print("Target shape:", y_embeddings.shape)

embedding_dim = x_embeddings.shape[1]
num_samples = x_embeddings.shape[0]

# ===== Device =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

x_embeddings = x_embeddings.to(device)
y_embeddings = y_embeddings.to(device)
24
+
25
# ===== Define model =====
class SemanticMapper(nn.Module):
    """MLP mapping source-sentence embeddings into target-embedding space."""

    def __init__(self, dim):
        super().__init__()
        expanded = dim * 2
        self.net = nn.Sequential(
            nn.Linear(dim, expanded),
            nn.ReLU(),
            nn.Linear(expanded, dim),
        )

    def forward(self, x):
        return self.net(x)
37
+
38
model = SemanticMapper(embedding_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CosineEmbeddingLoss()

# ===== Training config =====
epochs = 20
batch_size = 64
loss_history = []

# ===== Training loop =====
for epoch in range(epochs):
    perm = torch.randperm(num_samples, device=device)
    epoch_loss = 0.0
    steps = 0
    for i in range(0, num_samples, batch_size):
        idx = perm[i:i + batch_size]
        x_batch = x_embeddings[idx]
        y_batch = y_embeddings[idx]
        target = torch.ones(x_batch.size(0), device=device)  # cosine target = +1

        y_pred = model(x_batch)
        loss = criterion(y_pred, y_batch, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        steps += 1

    # FIX: average over the actual number of optimizer steps; the previous
    # divisor `num_samples / batch_size` is fractional and wrong whenever the
    # dataset size is not a multiple of the batch size.
    avg_loss = epoch_loss / max(1, steps)
    loss_history.append(avg_loss)
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.6f}")

# ===== Plot loss curve =====
plt.plot(loss_history, marker="o")
plt.title("Training Loss (Cosine Similarity)")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.grid(True)
plt.show()

# Save the trained model.
# FIX (consistency): test_chat.py loads this file expecting a
# {"state_dict", "dim"} checkpoint (same format test_full.py writes);
# saving a bare state_dict broke that consumer.
torch.save({"state_dict": model.state_dict(), "dim": embedding_dim},
           "semantic_mapper.pth")
pre_embed.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
import time
import os

# CONFIGURATION
INPUT_FILE = "chat_1turn.csv"
OUTPUT_FILE = "chat_embeddings.pt"
MODEL_NAME = "Snowflake/snowflake-arctic-embed-l-v2.0"
BATCH_SIZE = 128                     # Go big or go slow
USE_GPU = torch.cuda.is_available()
MAX_ROWS = 2000                      # Set to e.g. 1000 for quick dev tests

# 🔧 Sanity checks
assert os.path.exists(INPUT_FILE), f"❌ File not found: {INPUT_FILE}"

# 🚀 Load model
print(f"🧠 Loading model: {MODEL_NAME} {'[GPU]' if USE_GPU else '[CPU]'}")
model = SentenceTransformer(MODEL_NAME, device="cuda" if USE_GPU else "cpu")

# 📂 Load data
print("📂 Reading CSV...")
df = pd.read_csv(INPUT_FILE)
assert 'source' in df.columns and 'target' in df.columns, "❌ Missing 'source' or 'target' column!"

if MAX_ROWS:
    # Truthy MAX_ROWS caps the dataset for quick experiments.
    df = df.head(MAX_ROWS)

sources = df['source'].fillna("").tolist()
targets = df['target'].fillna("").tolist()
32
+
33
# ⏱️ Embed all at once
def embed_all(texts, label):
    """Encode `texts` with the global SentenceTransformer; returns an [N, D] tensor.

    FIX: dropped the invalid `torch_dtype=torch.int8` argument —
    SentenceTransformer.encode() has no such parameter (it either raises
    TypeError or is silently ignored, depending on version). Quantized
    output, if ever wanted, is requested via `precision="int8"` instead.
    """
    print(f"⚙️ Embedding {label} ({len(texts)} items)...")
    start = time.time()
    embeddings = model.encode(
        texts,
        batch_size=BATCH_SIZE,
        convert_to_tensor=True,
        normalize_embeddings=True,
        show_progress_bar=True,
        device="cuda" if USE_GPU else "cpu",
    )
    print(f"✅ {label} embedding done in {time.time() - start:.2f}s")
    return embeddings
48
+
49
source_tensor = embed_all(sources, "source")
target_tensor = embed_all(targets, "target")

# 💾 Save
print(f"💾 Saving to {OUTPUT_FILE}...")
payload = {"source": source_tensor, "target": target_tensor}
torch.save(payload, OUTPUT_FILE)
print(f"✅ Saved {len(sources)} embeddings to {OUTPUT_FILE}")
test_chat.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# chat.py - use trained mapper + decoder interactively

import torch
from transformers import T5Tokenizer
from sentence_transformers import SentenceTransformer
import torch.nn as nn

# ===== CONFIG =====
MAPPER_PTH = "semantic_mapper.pth"
DECODER_PTH = "embedding_decoder.pth"
MODEL_NAME = "Snowflake/snowflake-arctic-embed-l-v2.0"
MAX_LEN = 4096
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ===== LOAD TOKENIZER =====
tokenizer = T5Tokenizer.from_pretrained("t5-small")
pad_id = tokenizer.pad_token_id   # also used as the decoder start token
eos_id = tokenizer.eos_token_id   # T5's </s>
20
+
21
# ===== MODEL CLASSES (same defs as training) =====
class SemanticMapper(torch.nn.Module):
    """Source-embedding -> target-embedding MLP (must match the training def)."""

    def __init__(self, dim):
        super().__init__()
        hidden = dim * 2
        self.net = torch.nn.Sequential(
            torch.nn.Linear(dim, hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden, dim),
        )

    def forward(self, x):
        return self.net(x)
31
+
32
class EmbeddingDecoder(nn.Module):
    """GRU decoder conditioned on the sentence embedding at every step.

    The output projection is weight-tied to the token embedding table, so
    hidden_dim must equal the embedding width.
    """

    def __init__(self, input_dim, hidden_dim, vocab_size, p=0.2):
        super().__init__()
        self.bridge = nn.Linear(input_dim, hidden_dim)      # emb -> h0
        self.embed = nn.Embedding(vocab_size, hidden_dim)   # token -> hidden
        self.gru = nn.GRU(hidden_dim + input_dim, hidden_dim, batch_first=True)
        self.ln = nn.LayerNorm(hidden_dim)
        self.fc = nn.Linear(hidden_dim, vocab_size, bias=True)
        self.drop = nn.Dropout(p)
        self.fc.weight = self.embed.weight                  # tie weights

    @torch.no_grad()
    def greedy_decode(self, emb_vec, max_len, start_id, eos_id):
        """Greedy decode until every row emits `eos_id` or `max_len` is hit."""
        batch = emb_vec.size(0)
        hidden = torch.tanh(self.bridge(emb_vec)).unsqueeze(0)
        cur = torch.full((batch, 1), start_id, dtype=torch.long, device=emb_vec.device)
        generated = []
        for _ in range(max_len):
            token_h = self.drop(self.embed(cur))                       # [B,1,H]
            gru_in = torch.cat([token_h, emb_vec.unsqueeze(1)], dim=-1)
            out, hidden = self.gru(gru_in, hidden)
            logits = self.fc(self.drop(self.ln(out.squeeze(1))))
            # NOTE(review): `pad_id` is the module-level global, not a
            # parameter — this class only works inside this script.
            logits[:, pad_id] = -1e9                                   # never emit PAD
            next_id = torch.argmax(logits, dim=-1)
            generated.append(next_id.unsqueeze(1))
            if (next_id == eos_id).all():
                break
            cur = next_id.unsqueeze(1)
        return torch.cat(generated, dim=1)
62
+
63
+
64
# ===== LOAD MODELS =====
def _load_ckpt(path):
    """Read a {"state_dict", "dim", ...} checkpoint onto DEVICE."""
    return torch.load(path, map_location=DEVICE)

mapper_ckpt = _load_ckpt(MAPPER_PTH)
mapper = SemanticMapper(mapper_ckpt["dim"]).to(DEVICE)
mapper.load_state_dict(mapper_ckpt["state_dict"])
mapper.eval()

dec_ckpt = _load_ckpt(DECODER_PTH)
decoder = EmbeddingDecoder(dec_ckpt["dim"], 512, dec_ckpt["vocab_size"]).to(DEVICE)
decoder.load_state_dict(dec_ckpt["state_dict"])
decoder.eval()

embedder = SentenceTransformer(MODEL_NAME, device=DEVICE)
76
+
77
# ===== CHAT LOOP =====
def chat():
    """Interactive REPL: embed user text, map it, greedy-decode a reply."""
    print("Chat ready. Type 'quit' to exit.")
    while True:
        user = input("User: ").strip()
        if not user or user.lower() in {"quit", "exit"}:
            break
        x = embedder.encode([user], convert_to_tensor=True, device=DEVICE).detach().clone()
        y_pred = mapper(x)
        ids = decoder.greedy_decode(y_pred, max_len=MAX_LEN,
                                    start_id=pad_id, eos_id=eos_id)[0].tolist()
        reply = tokenizer.decode(ids, skip_special_tokens=True)
        print("Bot:", reply)

if __name__ == "__main__":
    chat()
test_embed.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from sentence_transformers import SentenceTransformer
import time
import torch

model = SentenceTransformer("Snowflake/snowflake-arctic-embed-xs", device="cuda")

texts = ["The quick brown fox jumps over the lazy dog."] * 1000

start = time.time()
# FIX: removed the invalid `torch_dtype=torch.int8` kwarg — encode() has no
# such parameter (quantized output is requested via precision="int8").
embeddings = model.encode(
    texts,
    batch_size=512,
    convert_to_tensor=True,
    normalize_embeddings=True,
    device="cuda",
)
print(f"⏱️ Embedded 1000 items in {time.time() - start:.2f} seconds")
test_full.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
import torch, torch.nn as nn, torch.optim as optim
import pandas as pd
import matplotlib.pyplot as plt
from transformers import T5Tokenizer
from sentence_transformers import SentenceTransformer

# ==== Config ====
EMB_FILE = "chat_embeddings.pt"    # {"source": [N,D], "target": [N,D]}
CSV_FILE = "chat_1turn.csv"        # columns: source, target
MODEL_NAME = "Snowflake/snowflake-arctic-embed-l-v2.0"
EPOCHS_MAPPER = 20
EPOCHS_DECODER = 160
BATCH_SIZE_MAP = 64
BATCH_SIZE_DEC = 64
LR_MAPPER = 1e-3
LR_DECODER = 1e-3
HIDDEN_DIM = 512
MAX_LEN = 64
PLOT_LOSS = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ==== Load embeddings & CSV ====
emb = torch.load(EMB_FILE, map_location=device)
x_embeddings = emb["source"].to(device)   # [N,D]
y_embeddings = emb["target"].to(device)   # [N,D]
N, D = x_embeddings.shape
print(f"Loaded embeddings: N={N}, D={D}")

df = pd.read_csv(CSV_FILE)
assert "target" in df.columns
targets = df["target"].fillna("").tolist()
35
+
36
# ==== Mapper: x_emb -> y_emb ====
class SemanticMapper(nn.Module):
    """Two-layer MLP mapping source embeddings into target-embedding space."""

    def __init__(self, dim):
        super().__init__()
        wide = dim * 2
        self.net = nn.Sequential(
            nn.Linear(dim, wide),
            nn.ReLU(),
            nn.Linear(wide, dim),
        )

    def forward(self, x):
        return self.net(x)
45
+
46
mapper = SemanticMapper(D).to(device)
opt_map = optim.Adam(mapper.parameters(), lr=LR_MAPPER)
crit_map = nn.CosineEmbeddingLoss()

print("\nTraining mapper...")
map_losses = []
for ep in range(EPOCHS_MAPPER):
    perm = torch.randperm(N, device=device)
    total, steps = 0.0, 0
    for lo in range(0, N, BATCH_SIZE_MAP):
        idx = perm[lo:lo + BATCH_SIZE_MAP]
        xb, yb = x_embeddings[idx], y_embeddings[idx]
        ones = torch.ones(xb.size(0), device=device)   # cosine target = +1
        loss = crit_map(mapper(xb), yb, ones)
        opt_map.zero_grad()
        loss.backward()
        opt_map.step()
        total += loss.item()
        steps += 1
    avg = total / max(1, steps)
    map_losses.append(avg)
    print(f"Mapper Epoch {ep+1}/{EPOCHS_MAPPER} - Loss: {avg:.6f}")

if PLOT_LOSS:
    plt.figure(); plt.plot(map_losses, marker="o"); plt.title("Mapper Loss"); plt.grid(True); plt.show()

torch.save({"state_dict": mapper.state_dict(), "dim": D}, "semantic_mapper.pth")
print("Saved mapper -> semantic_mapper.pth")
73
+
74
# ==== Decoder: y_emb -> target text ====
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tok = tokenizer(targets, padding=True, truncation=True, max_length=MAX_LEN,
                return_tensors="pt", add_special_tokens=True)
labels = tok["input_ids"].to(device)   # [N,L]
pad_id = tokenizer.pad_token_id
eos_id = tokenizer.eos_token_id        # T5 uses </s> as EOS

# Strict teacher-forcing inputs: right-shift the labels, with pad_id as the
# BOS symbol (T5 convention): y_in[:,0]=pad, y_in[:,t]=labels[:,t-1];
# the model predicts labels[t] given y_in[t].
y_in = torch.full_like(labels, pad_id)
y_in[:, 1:] = labels[:, :-1]
y_out = labels
87
+
88
class EmbeddingDecoder(nn.Module):
    """
    GRU decoder conditioned on the sentence embedding at every step.

    - Strong conditioning: the embedding is concatenated to each step input.
    - Weight tying: fc.weight is embed.weight (hidden_dim == embedding width).
    - Deterministic teacher forcing via pre-shifted inputs (no ratios).
    """

    def __init__(self, input_dim, hidden_dim, vocab_size, p=0.2):
        super().__init__()
        self.bridge = nn.Linear(input_dim, hidden_dim)      # emb -> h0
        self.embed = nn.Embedding(vocab_size, hidden_dim)   # token -> hidden
        self.gru = nn.GRU(hidden_dim + input_dim, hidden_dim, batch_first=True)
        self.ln = nn.LayerNorm(hidden_dim)
        self.fc = nn.Linear(hidden_dim, vocab_size, bias=True)
        self.drop = nn.Dropout(p)
        self.fc.weight = self.embed.weight                  # tie weights

    def forward_teacher_forced(self, emb_vec, in_ids, max_len):
        """emb_vec: [B,D]; in_ids: [B,L] shifted inputs. Returns [B,L,V] logits."""
        hidden = torch.tanh(self.bridge(emb_vec)).unsqueeze(0)           # [1,B,H]
        per_step = []
        for t in range(max_len):
            tok_ids = in_ids[:, t].unsqueeze(1)                          # [B,1]
            tok_h = self.drop(self.embed(tok_ids))                       # [B,1,H]
            gru_in = torch.cat([tok_h, emb_vec.unsqueeze(1)], dim=-1)    # [B,1,H+D]
            out, hidden = self.gru(gru_in, hidden)                       # [B,1,H]
            normed = self.ln(out.squeeze(1))                             # [B,H]
            per_step.append(self.fc(self.drop(normed)).unsqueeze(1))     # [B,1,V]
        return torch.cat(per_step, dim=1)                                # [B,L,V]

    @torch.no_grad()
    def greedy_decode(self, emb_vec, max_len, start_id, eos_id):
        """Pure greedy decoding with EOS early-stop; PAD is masked to curb loops."""
        batch = emb_vec.size(0)
        hidden = torch.tanh(self.bridge(emb_vec)).unsqueeze(0)
        cur = torch.full((batch, 1), start_id, dtype=torch.long, device=emb_vec.device)
        decoded = []
        finished = torch.zeros(batch, dtype=torch.bool, device=emb_vec.device)

        for _ in range(max_len):
            tok_h = self.embed(cur)                                      # [B,1,H]
            gru_in = torch.cat([tok_h, emb_vec.unsqueeze(1)], dim=-1)
            out, hidden = self.gru(gru_in, hidden)
            logits = self.fc(out.squeeze(1))                             # [B,V]
            # NOTE(review): `pad_id` is the module-level global, not a
            # parameter — the class is only usable inside this script.
            logits[:, pad_id] = -1e9                                     # discourage PAD
            nxt = torch.argmax(logits, dim=-1)                           # [B]
            decoded.append(nxt.unsqueeze(1))
            finished |= (nxt == eos_id)
            if finished.all():
                break
            cur = nxt.unsqueeze(1)
        return torch.cat(decoded, dim=1)                                 # [B,T]
147
+
148
decoder = EmbeddingDecoder(D, HIDDEN_DIM, tokenizer.vocab_size).to(device)
opt_dec = optim.Adam(decoder.parameters(), lr=LR_DECODER)
crit_dec = nn.CrossEntropyLoss(ignore_index=pad_id)   # no smoothing (small N)

print("\nTraining decoder...")
dec_losses = []
steps = (N + BATCH_SIZE_DEC - 1) // BATCH_SIZE_DEC    # batches per epoch
for ep in range(EPOCHS_DECODER):
    perm = torch.randperm(N, device=device)
    total = 0.0
    for lo in range(0, N, BATCH_SIZE_DEC):
        idx = perm[lo:lo + BATCH_SIZE_DEC]
        eb = y_embeddings[idx]    # condition on TRUE target-space embeddings
        yin = y_in[idx]           # shifted inputs
        yout = y_out[idx]         # labels

        opt_dec.zero_grad()
        logits = decoder.forward_teacher_forced(eb, yin, max_len=yout.size(1))  # [B,L,V]
        loss = crit_dec(logits.reshape(-1, logits.size(-1)), yout.reshape(-1))
        loss.backward()
        nn.utils.clip_grad_norm_(decoder.parameters(), 1.0)   # tame GRU gradients
        opt_dec.step()
        total += loss.item()
    avg = total / max(1, steps)
    dec_losses.append(avg)
    print(f"Decoder Epoch {ep+1}/{EPOCHS_DECODER} - Loss: {avg:.4f}")

if PLOT_LOSS:
    plt.figure(); plt.plot(dec_losses, marker="o"); plt.title("Decoder Loss"); plt.grid(True); plt.show()

torch.save({"state_dict": decoder.state_dict(), "dim": D, "vocab_size": tokenizer.vocab_size},
           "embedding_decoder.pth")
print("Saved decoder -> embedding_decoder.pth")
181
+
182
# ==== E2E inference ====
embedder = SentenceTransformer(MODEL_NAME, device=device)

# FIX: the old code raised the dimension-mismatch RuntimeError *inside* a try
# whose `except Exception: pass` immediately swallowed it — the guard was dead
# code. Only the (possibly unavailable) dimension query is guarded now; a real
# mismatch propagates.
emb_dim = None
try:
    emb_dim = embedder.get_sentence_embedding_dimension()
except Exception:
    pass  # older sentence-transformers may not expose the accessor
if emb_dim is not None and emb_dim != D:
    raise RuntimeError(f"Embedder dim {emb_dim} != training dim {D}. "
                       "Regenerate embeddings with same MODEL_NAME.")

@torch.no_grad()
def generate(text: str, max_len: int = 24) -> str:
    """source text -> x_emb -> mapper -> y_emb -> greedy decode -> reply."""
    x = embedder.encode([text], convert_to_tensor=True, device=device)   # [1,D]
    y_pred = mapper(x)                                                   # [1,D]
    ids = decoder.greedy_decode(y_pred, max_len=max_len, start_id=pad_id, eos_id=eos_id)[0].tolist()
    return tokenizer.decode(ids, skip_special_tokens=True)

print("\nE2E test:")
inp = "User: Hi"
print(f"{inp} ->", generate(inp))
testcuda.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
import torch

# FIX: the original unconditionally queried device 0, which raises a
# RuntimeError on CPU-only machines; the per-device details are only safe
# (and meaningful) to query when CUDA is actually available.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))