import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# ===== Load precomputed embeddings =====
emb_data = torch.load("chat_embeddings.pt")
x_embeddings = emb_data["source"]  # [N, D]
y_embeddings = emb_data["target"]  # [N, D]

print("Source shape:", x_embeddings.shape)
print("Target shape:", y_embeddings.shape)

embedding_dim = x_embeddings.shape[1]
num_samples = x_embeddings.shape[0]

# ===== Device =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

x_embeddings = x_embeddings.to(device)
y_embeddings = y_embeddings.to(device)


# ===== Define model =====
class SemanticMapper(nn.Module):
    """Two-layer MLP that maps source embeddings into the target embedding space."""

    def __init__(self, dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim * 2),
            nn.ReLU(),
            nn.Linear(dim * 2, dim),
        )

    def forward(self, x):
        return self.net(x)


model = SemanticMapper(embedding_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CosineEmbeddingLoss()  # with target=+1, minimizes 1 - cos(pred, target)

# ===== Training config =====
epochs = 20
batch_size = 64
num_batches = (num_samples + batch_size - 1) // batch_size  # ceil division handles a partial final batch
loss_history = []

# ===== Training loop =====
for epoch in range(epochs):
    perm = torch.randperm(num_samples, device=device)  # reshuffle each epoch
    epoch_loss = 0.0

    for i in range(0, num_samples, batch_size):
        idx = perm[i:i + batch_size]
        x_batch = x_embeddings[idx]
        y_batch = y_embeddings[idx]
        target = torch.ones(x_batch.size(0), device=device)  # cosine target = +1

        y_pred = model(x_batch)
        loss = criterion(y_pred, y_batch, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / num_batches  # average per-batch loss for the epoch
    loss_history.append(avg_loss)
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.6f}")

# ===== Plot loss curve =====
plt.plot(loss_history, marker="o")
plt.title("Training Loss (Cosine Embedding Loss)")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.grid(True)
plt.show()

# Save the trained model
torch.save(model.state_dict(), "semantic_mapper.pth")
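
# ===== Inference sketch (illustrative addition, not part of the original script) =====
# A minimal example of reloading the saved weights and mapping a source
# embedding into the target space. For self-containment it reuses the first
# training example as a stand-in input; in practice the input would be a new
# embedding produced by the same model that generated x_embeddings.
loaded = SemanticMapper(embedding_dim).to(device)
loaded.load_state_dict(torch.load("semantic_mapper.pth", map_location=device))
loaded.eval()  # no dropout/batchnorm here, but good practice for inference

with torch.no_grad():
    new_source = x_embeddings[:1]  # [1, D] stand-in for a fresh source embedding
    mapped = loaded(new_source)    # [1, D] prediction in target-embedding space
    cos = nn.functional.cosine_similarity(mapped, y_embeddings[:1])
    print(f"Cosine similarity to true target: {cos.item():.4f}")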