Upload 5 files
- config.json +29 -0
- dataset_space_time.py +56 -0
- inference.py +96 -0
- model.safetensors +3 -0
- modeling_custom_minilm.py +205 -0
config.json
ADDED
@@ -0,0 +1,29 @@
+{
+  "architectures": [
+    "SpaceTimeMiniLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "num_space": 4,
+  "num_time": 60,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "use_space_embedding": true,
+  "use_time_embedding": true,
+  "vocab_size": 30522
+}
dataset_space_time.py
ADDED
@@ -0,0 +1,56 @@
+import torch
+from torch.utils.data import Dataset, DataLoader
+from transformers import AutoTokenizer, DataCollatorForLanguageModeling
+
+SEQ_LEN, BATCH_SIZE = 128, 32
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+
+space_mapping = {'UK': 0, 'US': 1, 'AUS': 2, 'CAN': 3}
+time_mapping = {
+    f"{year}-{month:02d}": i
+    for i, (year, month) in enumerate(
+        [(y, m) for y in range(2017, 2022 + 1) for m in range(1, 13)]
+    )
+    if i < 60
+}
+
+data_collator = DataCollatorForLanguageModeling(
+    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
+)
+
+class PairwiseSimilarityDataset(Dataset):
+    def __init__(self, df):
+        self.df = df.reset_index(drop=True)
+        self.tokenizer = tokenizer
+    def __len__(self):
+        return len(self.df)
+    def __getitem__(self, idx):
+        row = self.df.loc[idx]
+        return {
+            "sent1": row.sent1,
+            "sent2": row.sent2,
+            "t1": time_mapping[row.t1],
+            "t2": time_mapping[row.t2],
+            "s1": space_mapping[row.s1],
+            "s2": space_mapping[row.s2],
+            "sim": row.similarity
+        }
+
+def collate_fn(batch):
+    texts = [b["sent1"] for b in batch] + [b["sent2"] for b in batch]
+    enc = tokenizer(
+        texts,
+        padding="longest",
+        truncation=True,
+        max_length=128,
+        return_tensors="pt"
+    )
+    B = len(batch)
+    t1 = torch.tensor([b["t1"] for b in batch], dtype=torch.long)
+    t2 = torch.tensor([b["t2"] for b in batch], dtype=torch.long)
+    s1 = torch.tensor([b["s1"] for b in batch], dtype=torch.long)
+    s2 = torch.tensor([b["s2"] for b in batch], dtype=torch.long)
+    sims = torch.tensor([b["sim"] for b in batch], dtype=torch.float)
+    return enc, B, s1, s2, t1, t2, sims
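
A minimal usage sketch of how collate_fn might be wired into a DataLoader. The DataFrame columns (sent1, sent2, t1, t2, s1, s2, similarity) are inferred from __getitem__ above; the sample row and values here are illustrative only.

# Hypothetical usage sketch; assumes a pandas DataFrame with the columns read in __getitem__.
import pandas as pd
from torch.utils.data import DataLoader
from dataset_space_time import PairwiseSimilarityDataset, collate_fn, BATCH_SIZE

df = pd.DataFrame([{
    "sent1": "Prices rose sharply.", "sent2": "Inflation picked up.",
    "t1": "2019-05", "t2": "2021-11",      # keys of time_mapping (2017-01 .. 2021-12)
    "s1": "UK", "s2": "US",                # keys of space_mapping
    "similarity": 0.8,
}])

loader = DataLoader(
    PairwiseSimilarityDataset(df),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
enc, B, s1, s2, t1, t2, sims = next(iter(loader))  # enc holds 2*B tokenized sentences (sent1 batch then sent2 batch)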
inference.py
ADDED
@@ -0,0 +1,96 @@
+import torch
+import torch.nn.functional as F
+
+space_mapping = {'UK': 0, 'US': 1, 'AUS': 2, 'CAN': 3}
+time_mapping = {
+    f"{year}-{month:02d}": i
+    for i, (year, month) in enumerate(
+        [(y, m) for y in range(2017, 2022 + 1) for m in range(1, 13)]
+    )
+    if i < 60
+}
+
+def compute_similarity(
+    sent1: str,
+    sent2: str,
+    time1: str,
+    time2: str,
+    space1: str,
+    space2: str,
+    model,
+    tokenizer,
+    device="cuda"
+) -> float:
+    device = torch.device(device if torch.cuda.is_available() else "cpu")
+    model = model.to(device).eval()
+
+    enc1 = tokenizer(
+        sent1,
+        padding="max_length",
+        truncation=True,
+        max_length=128,
+        return_tensors="pt"
+    ).to(device)
+    enc2 = tokenizer(
+        sent2,
+        padding="max_length",
+        truncation=True,
+        max_length=128,
+        return_tensors="pt"
+    ).to(device)
+
+    space1 = space_mapping[space1]
+    space2 = space_mapping[space2]
+    time1 = time_mapping[time1]
+    time2 = time_mapping[time2]
+    s1 = torch.tensor([space1], dtype=torch.long, device=device)
+    t1 = torch.tensor([time1], dtype=torch.long, device=device)
+    s2 = torch.tensor([space2], dtype=torch.long, device=device)
+    t2 = torch.tensor([time2], dtype=torch.long, device=device)
+
+    with torch.no_grad():
+        emb1 = model.embed(
+            enc1["input_ids"],
+            enc1["attention_mask"],
+            s1, t1
+        )
+        emb2 = model.embed(
+            enc2["input_ids"],
+            enc2["attention_mask"],
+            s2, t2
+        )
+
+    sim = F.cosine_similarity(emb1, emb2, dim=-1)
+    return sim.item()
+
+def embed_sentence(
+    sent: str,
+    time: str,
+    space: str,
+    model,
+    tokenizer,
+    device="cuda"
+) -> torch.Tensor:
+    device = torch.device(device if torch.cuda.is_available() else "cpu")
+    model = model.to(device).eval()
+
+    enc = tokenizer(
+        sent,
+        padding="max_length",
+        truncation=True,
+        max_length=128,
+        return_tensors="pt"
+    ).to(device)
+
+    space = space_mapping[space]
+    time = time_mapping[time]
+    s = torch.tensor([space], dtype=torch.long, device=device)
+    t = torch.tensor([time], dtype=torch.long, device=device)
+
+    with torch.no_grad():
+        emb = model.embed(
+            enc["input_ids"],
+            enc["attention_mask"],
+            s, t
+        )
+    return emb
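
A hedged example of calling compute_similarity. The loading route is an assumption: SpaceTimeMiniLM is defined in modeling_custom_minilm.py below, and from_pretrained on a local copy of this repo should resolve config.json and model.safetensors, but adjust the path or fall back to loading the state dict manually if it does not.

# Sketch only: assumes the repo files are available locally and that
# SpaceTimeMiniLM.from_pretrained(...) picks up config.json + model.safetensors.
from transformers import AutoTokenizer
from modeling_custom_minilm import SpaceTimeMiniLM
from inference import compute_similarity

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = SpaceTimeMiniLM.from_pretrained(".")  # directory holding config.json and model.safetensors

score = compute_similarity(
    "The weather has been unusually warm.",
    "Temperatures were above average.",
    time1="2019-07", time2="2021-07",
    space1="UK", space2="AUS",
    model=model, tokenizer=tokenizer, device="cuda",
)
print(score)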
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce31c3a4bd9044584c2a2c058a548ecc8dfebc3397a6ea3dd078ea347b99a6f8
+size 145165416
modeling_custom_minilm.py
ADDED
@@ -0,0 +1,205 @@
+import torch
+import torch.nn as nn
+import math
+from transformers import AutoModel, AutoTokenizer, AutoConfig
+from transformers import PreTrainedModel
+from transformers.models.bert.modeling_bert import BertSelfAttention
+
+class SpaceEmbedding(nn.Module):
+    def __init__(self, num_embeddings=4, embedding_dim=384):
+        super().__init__()
+        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
+
+    def forward(self, x):
+        return self.embedding(x)
+
+class TimeEmbedding(nn.Module):
+    def __init__(self, max_months, dim=384):
+        super().__init__()
+        self.dim = dim
+        pe = torch.zeros(max_months, dim)
+        pos = torch.arange(0, max_months).unsqueeze(1)
+        i = torch.arange(0, dim, 2)
+        pe[:, 0::2] = torch.sin(pos / (10000 ** (2*i/dim)))
+        pe[:, 1::2] = torch.cos(pos / (10000 ** (2*i/dim)))
+        self.register_buffer("pe", pe)
+
+    def forward(self, idx):
+        return self.pe[idx]
+
+# ----------------------------
+# 1) Custom Space–Time Attention
+# ----------------------------
+class SpaceTimeSelfAttention(nn.Module):
+    def __init__(self, orig_self: BertSelfAttention, config):
+        super().__init__()
+        self.orig = orig_self
+        self.config = config
+        self.W_t = nn.Linear(config.hidden_size, config.hidden_size)
+        self.W_s = nn.Linear(config.hidden_size, config.hidden_size)
+
+    def transpose_for_scores(self, x):
+        return self.orig.transpose_for_scores(x)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        time_embeddings=None,
+        space_embeddings=None,
+    ):
+
+        mixed_q = self.orig.query(hidden_states)
+        mixed_k = self.orig.key(hidden_states)
+        mixed_v = self.orig.value(hidden_states)
+
+        query_layer = self.transpose_for_scores(mixed_q)
+        key_layer = self.transpose_for_scores(mixed_k)
+        value_layer = self.transpose_for_scores(mixed_v)
+
+        T = self.W_t(time_embeddings)
+        S = self.W_s(space_embeddings)
+        T_layer = self.transpose_for_scores(T)
+        S_layer = self.transpose_for_scores(S)
+
+        base_scores = torch.matmul(
+            query_layer,
+            key_layer.transpose(-1, -2)
+        )
+
+        eps = 1e-6
+        T_norm = T_layer.norm(dim=-1, keepdim=True)
+        time_sim = torch.matmul(
+            T_layer,
+            T_layer.transpose(-1, -2)
+        ) / (T_norm + eps)
+
+        S_norm = S_layer.norm(dim=-1, keepdim=True)
+        space_sim = torch.matmul(
+            S_layer,
+            S_layer.transpose(-1, -2)
+        ) / (S_norm + eps)
+
+        attn_scores = base_scores * time_sim * space_sim
+
+        dk = self.config.hidden_size // self.config.num_attention_heads
+        attn_scores = attn_scores / math.sqrt(dk)
+
+        if attention_mask is not None:
+            attn_scores = attn_scores + attention_mask
+        attn_probs = nn.Softmax(dim=-1)(attn_scores)
+        attn_probs = self.orig.dropout(attn_probs)
+
+        if head_mask is not None:
+            attn_probs = attn_probs * head_mask
+
+        context = torch.matmul(attn_probs, value_layer)
+        context = context.permute(0, 2, 1, 3).contiguous()
+        new_shape = context.size()[:-2] + (self.config.hidden_size,)
+        context = context.view(*new_shape)
+
+        if output_attentions:
+            return (context, attn_probs)
+        return context
+
+
+# ----------------------------
+# 2) Full Space–Time–MiniLM Model
+# ----------------------------
+class SpaceTimeMiniLM(PreTrainedModel):
+    config_class = AutoConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.base = AutoModel.from_config(config)
+        self.config = config
+
+        for layer in self.base.encoder.layer:
+            orig_self = layer.attention.self
+            layer.attention.self = SpaceTimeSelfAttention(orig_self, self.config)
+
+        self.space_embed = SpaceEmbedding(num_embeddings=config.num_space,
+                                          embedding_dim=self.config.hidden_size)
+        self.time_embed = TimeEmbedding(max_months=config.num_time,
+                                        dim=self.config.hidden_size)
+
+        self.mlm_head = nn.Linear(self.config.hidden_size,
+                                  config.vocab_size)
+        self.space_head = nn.Linear(self.config.hidden_size, config.num_space)
+        self.time_head = nn.Linear(self.config.hidden_size, config.num_time)
+
+    def forward(self, input_ids, attention_mask, space_ids, time_ids):
+        B, L = input_ids.size()
+
+        extended_mask = self.base.get_extended_attention_mask(attention_mask, (B, L), device=input_ids.device)
+
+        emb = self.base.embeddings(input_ids)
+
+        S = self.space_embed(space_ids)
+        T = self.time_embed(time_ids)
+        S = S.unsqueeze(1).expand(-1, L, -1)
+        T = T.unsqueeze(1).expand(-1, L, -1)
+
+        hidden_states = emb
+        for layer in self.base.encoder.layer:
+            attn_out = layer.attention.self(
+                hidden_states,
+                attention_mask=extended_mask,
+                head_mask=None,
+                output_attentions=False,
+                time_embeddings=T,
+                space_embeddings=S
+            )
+            attn_out = layer.attention.output(attn_out, hidden_states)
+            interm = layer.intermediate(attn_out)
+            hidden_states = layer.output(interm, attn_out)
+
+        sequence_output = hidden_states
+        pooled_output = self.base.pooler(sequence_output)
+
+        mlm_logits = self.mlm_head(sequence_output)
+        space_logits = self.space_head(pooled_output)
+        time_logits = self.time_head(pooled_output)
+
+        return mlm_logits, space_logits, time_logits
+
+    def embed(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: torch.LongTensor,
+        space_ids: torch.LongTensor,
+        time_ids: torch.LongTensor
+    ) -> torch.FloatTensor:
+        B, L = input_ids.size()
+
+        extended_mask = self.base.get_extended_attention_mask(
+            attention_mask, (B, L), device=input_ids.device
+        )
+
+        hidden_states = self.base.embeddings(input_ids)
+
+        S = self.space_embed(space_ids)
+        T = self.time_embed(time_ids)
+        S = S.unsqueeze(1).expand(-1, L, -1)
+        T = T.unsqueeze(1).expand(-1, L, -1)
+
+        for layer in self.base.encoder.layer:
+            attn_out = layer.attention.self(
+                hidden_states,
+                attention_mask=extended_mask,
+                head_mask=None,
+                output_attentions=False,
+                time_embeddings=T,
+                space_embeddings=S
+            )
+            attn_out = layer.attention.output(attn_out, hidden_states)
+            interm = layer.intermediate(attn_out)
+            hidden_states = layer.output(interm, attn_out)
+
+        mask_exp = attention_mask.unsqueeze(-1).expand_as(hidden_states).float()
+        sum_emb = torch.sum(hidden_states * mask_exp, dim=1)
+        sum_mask = mask_exp.sum(dim=1).clamp(min=1e-9)
+        pooled = sum_emb / sum_mask
+
+        return pooled
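
A quick shape sanity check for the model above, using randomly initialized weights. The BertConfig values mirror config.json in this upload; num_space and num_time are passed as extra config attributes, and the sample ids are illustrative.

# Shape smoke test with randomly initialized weights; mirrors config.json above.
import torch
from transformers import BertConfig
from modeling_custom_minilm import SpaceTimeMiniLM

config = BertConfig(
    hidden_size=384, num_hidden_layers=6, num_attention_heads=12,
    intermediate_size=1536, vocab_size=30522,
    num_space=4, num_time=60,
)
model = SpaceTimeMiniLM(config).eval()

input_ids = torch.randint(0, config.vocab_size, (2, 16))
attention_mask = torch.ones(2, 16, dtype=torch.long)
space_ids = torch.tensor([0, 3])   # UK, CAN
time_ids = torch.tensor([5, 42])   # month indices counted from 2017-01

with torch.no_grad():
    mlm_logits, space_logits, time_logits = model(input_ids, attention_mask, space_ids, time_ids)
    emb = model.embed(input_ids, attention_mask, space_ids, time_ids)

print(mlm_logits.shape, space_logits.shape, time_logits.shape, emb.shape)
# expected: (2, 16, 30522), (2, 4), (2, 60), (2, 384)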