Upload folder using huggingface_hub
- distilbert_best.pth +3 -0
- load.py +7 -0
- model.py +149 -0
distilbert_best.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d6a11bf0b621c366ec36342d15e17964c3bf060ebf0ab7de53012e909baf89ae
size 271071638
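The three lines above are a Git LFS pointer, not the checkpoint itself: the actual 271 MB weights file lives in LFS storage and is resolved on download. A minimal sketch of fetching it with huggingface_hub follows; the repo id is a placeholder, since the commit does not name the repository.

from huggingface_hub import hf_hub_download

# Placeholder repo id -- substitute the actual "<user>/<repo>" this commit lives in.
checkpoint_path = hf_hub_download(
    repo_id="your-username/your-repo",
    filename="distilbert_best.pth",
)
print(checkpoint_path)  # local cache path to the resolved checkpoint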
load.py
ADDED
@@ -0,0 +1,7 @@
import torch
from model import FakeBERT

model = FakeBERT(model_name=MODEL_NAME, num_classes=NUM_CLASSES).to(DEVICE)
state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
model.load_state_dict(state_dict)
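As committed, load.py is not runnable: MODEL_NAME, NUM_CLASSES, DEVICE, and MODEL_PATH are never defined. Below is a self-contained sketch under assumed values; the backbone name is inferred from the checkpoint's filename, and NUM_CLASSES matches the FakeBERT default in model.py.

import torch
from model import FakeBERT

MODEL_NAME = "distilbert-base-uncased"  # assumption inferred from the filename
NUM_CLASSES = 3                         # matches the FakeBERT default
MODEL_PATH = "distilbert_best.pth"      # the checkpoint added in this commit
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = FakeBERT(model_name=MODEL_NAME, num_classes=NUM_CLASSES).to(DEVICE)
state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
model.load_state_dict(state_dict)
model.eval()  # disable dropout for inference

If load_state_dict fails with missing or unexpected keys, the checkpoint was trained on a different backbone than the assumed one; set MODEL_NAME to the backbone used at training time.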
model.py
ADDED
@@ -0,0 +1,149 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel


# -------------------------------
# 1. Model Definition
# -------------------------------
class FakeBERT(nn.Module):
    def __init__(self, model_name="bert-base-uncased", num_classes=3, dropout=0.2):
        super().__init__()

        # Base transformer backbone (AutoModel covers BERT, DistilBERT, etc.)
        self.bert = AutoModel.from_pretrained(model_name)
        hidden = self.bert.config.hidden_size
        out_channels = 128

        # Parallel 1D convs across the token dimension (in_channels = hidden)
        self.conv1 = nn.Conv1d(hidden, out_channels, kernel_size=3, padding='same')
        self.conv2 = nn.Conv1d(hidden, out_channels, kernel_size=4, padding='same')
        self.conv3 = nn.Conv1d(hidden, out_channels, kernel_size=5, padding='same')

        # Post-concatenation conv layers operate on the concatenated channels
        self.conv_post1 = nn.Conv1d(out_channels * 3, out_channels, kernel_size=3, padding=1)
        self.conv_post2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, padding=1)

        # Final adaptive pooling to length 1 -> deterministic flattened size = out_channels
        self.final_pool_size = 1

        # Fully connected head (in_features = out_channels after the global pool)
        self.fc1 = nn.Linear(out_channels, 128)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()

        # Whether the backbone expects token_type_ids (BERT does, DistilBERT does not).
        # Use the model config if available; otherwise assume it does not.
        self._accepts_token_type_ids = getattr(self.bert.config, "type_vocab_size", None) is not None

    def _forward_transformer(self, input_ids, attention_mask=None, token_type_ids=None):
        """
        Handles both short and long sequences, chunking the input if needed.
        Returns last_hidden_state shaped (B, seq_len, hidden).
        """
        B, L = input_ids.size()
        max_len = getattr(self.bert.config, "max_position_embeddings", 512)

        # Helper to build kwargs robustly
        def build_kwargs(ii, am=None, tt=None):
            kwargs = {"input_ids": ii}
            if am is not None:
                kwargs["attention_mask"] = am
            if tt is not None and self._accepts_token_type_ids:
                kwargs["token_type_ids"] = tt
            return kwargs

        # --- Fast path: short sequence ---
        if L <= max_len:
            kwargs = build_kwargs(input_ids, attention_mask, token_type_ids)
            return self.bert(**kwargs).last_hidden_state  # (B, seq_len, hidden)

        # --- Long input: chunk and recombine ---
        chunks, masks, types = [], [], []
        for start in range(0, L, max_len):
            end = min(start + max_len, L)
            chunks.append(input_ids[:, start:end])
            if attention_mask is not None:
                masks.append(attention_mask[:, start:end])
            if token_type_ids is not None:
                types.append(token_type_ids[:, start:end])

        # Pad chunks to equal length (only the last chunk can be short)
        max_chunk_len = max(c.size(1) for c in chunks)
        device = input_ids.device

        padded_chunks = []
        padded_masks = [] if masks else None
        padded_types = [] if types else None

        for i, c in enumerate(chunks):
            pad_len = max_chunk_len - c.size(1)
            if pad_len > 0:
                pad_ids = torch.zeros(B, pad_len, dtype=c.dtype, device=device)
                c = torch.cat([c, pad_ids], dim=1)
            padded_chunks.append(c)

            if masks:
                m = masks[i]
                if pad_len > 0:
                    pad_m = torch.zeros(B, pad_len, dtype=m.dtype, device=device)
                    m = torch.cat([m, pad_m], dim=1)
                padded_masks.append(m)

            if types:
                t = types[i]
                if pad_len > 0:
                    pad_t = torch.zeros(B, pad_len, dtype=t.dtype, device=device)
                    t = torch.cat([t, pad_t], dim=1)
                padded_types.append(t)

        # Batch all chunks together for a single forward pass
        input_chunks = torch.cat(padded_chunks, dim=0)  # (B * n_chunks, chunk_len)
        attention_chunks = torch.cat(padded_masks, dim=0) if padded_masks is not None else None
        token_chunks = torch.cat(padded_types, dim=0) if padded_types is not None else None

        kwargs = build_kwargs(input_chunks, attention_chunks, token_chunks)
        x_all = self.bert(**kwargs).last_hidden_state  # (B * n_chunks, chunk_len, hidden)

        # Recombine: x_all is stacked as [chunk0_batch; chunk1_batch; ...], so split it
        # back into n_chunks pieces of shape (B, chunk_len, hidden) ...
        n_chunks = len(chunks)
        split = torch.split(x_all, input_chunks.size(0) // n_chunks, dim=0)
        # ... and concatenate along the token dimension
        x = torch.cat(list(split), dim=1)  # (B, total_seq_len, hidden)
        return x

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        # Transformer forward (handles chunking)
        x = self._forward_transformer(input_ids, attention_mask, token_type_ids)  # (B, seq_len, hidden)

        # --- Convolutional feature extraction ---
        x = x.transpose(1, 2)  # (B, hidden, seq_len)

        # Parallel conv + relu
        c1 = self.relu(self.conv1(x))
        c2 = self.relu(self.conv2(x))
        c3 = self.relu(self.conv3(x))

        # 'same' padding keeps all three outputs at seq_len, so they concatenate on the channel dim
        x = torch.cat([c1, c2, c3], dim=1)  # (B, 3*out_channels, seq_len)

        # Post convs
        x = self.relu(self.conv_post1(x))
        x = self.relu(self.conv_post2(x))

        # Final adaptive global pooling to fixed length 1
        x = F.adaptive_max_pool1d(x, self.final_pool_size)  # (B, out_channels, 1)
        x = x.squeeze(-1)  # (B, out_channels)

        # Fully connected head
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        logits = self.fc2(x)

        return logits
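For completeness, a minimal end-to-end inference sketch. Assumptions: the tokenizer name matches the backbone assumed in the load.py note above, and the mapping from the three logits to class labels is unspecified in this commit.

import torch
from transformers import AutoTokenizer
from model import FakeBERT

MODEL_NAME = "distilbert-base-uncased"  # assumption, see load.py note
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = FakeBERT(model_name=MODEL_NAME, num_classes=3)
model.load_state_dict(torch.load("distilbert_best.pth", map_location="cpu"))
model.eval()

# truncation=False keeps long articles intact; _forward_transformer then splits
# anything beyond max_position_embeddings into chunks, runs the backbone on all
# chunks in one batched pass, and re-concatenates the hidden states along the
# token dimension before the convolutional head.
text = "Example article text to classify."
enc = tokenizer(text, return_tensors="pt", truncation=False)

with torch.no_grad():
    logits = model(enc["input_ids"], attention_mask=enc["attention_mask"])
probs = torch.softmax(logits, dim=-1)  # (1, 3) class probabilities
print(probs)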