Commit ee0d322 · committed by jl
Parent(s): deb2196
fix: api leak, will deprecate api keys
Files changed:
- notebooks/thesis-proposed-model-vo3.ipynb +1 -0
- pyproject.toml +1 -0
- src/app.py +6 -0
- src/hatespeech_model.py +10 -40
- uv.lock +11 -0
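For context: the fix swaps hardcoded Cloudflare credentials for values loaded from a local .env file via python-dotenv. A minimal sketch of the pattern used below (the variable names match the diff; the .env contents shown are placeholders, not real values):

import os

from dotenv import load_dotenv

# .env (kept out of version control, e.g. via .gitignore):
#   CLOUDFLARE_API_BASE_URL=<api-base-url>
#   CLOUDFLARE_API_TOKEN=<secret-token>
#   CLOUDFLARE_MODEL_NAME=<model-id>
load_dotenv()  # populates os.environ from .env in the working directory

API_BASE_URL = os.getenv("CLOUDFLARE_API_BASE_URL")
HEADERS = {"Authorization": f"Bearer {os.getenv('CLOUDFLARE_API_TOKEN')}"}
MODEL_NAME = os.getenv("CLOUDFLARE_MODEL_NAME")

Note that removing the keys from the working tree does not remove them from git history, so rotating the leaked keys, as the commit message promises, is still required.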
notebooks/thesis-proposed-model-vo3.ipynb
ADDED
@@ -0,0 +1 @@
+
{"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.12.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[{"sourceType":"datasetVersion","sourceId":14919503,"datasetId":9546304,"databundleVersionId":15785969}],"dockerImageVersionId":31286,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.utils.data import DataLoader, Dataset\nfrom torch.optim import AdamW\nfrom sklearn.model_selection import train_test_split\nfrom transformers import AutoModel, AutoTokenizer\n\nimport numpy as np\nimport pandas as pd\nimport time\nimport os\nfrom tqdm.auto import tqdm\n\nfrom sklearn.metrics import (\n accuracy_score,\n precision_recall_fscore_support,\n classification_report,\n confusion_matrix\n)\n\nimport matplotlib.pyplot as plt\nimport seaborn as sns","metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"class AdditionalCustomDataset(Dataset):\n \"\"\"\n Pre-tokenizes all data during initialization for much faster training.\n \"\"\"\n def __init__(self, texts, labels, additional_texts, tokenizer, bert_tokenizer, max_length):\n self.labels = labels\n self.max_length = max_length\n \n # Pre-tokenize ALL data once during initialization (MUCH faster than tokenizing in __getitem__)\n print(\"Pre-tokenizing primary texts...\")\n primary_encodings = tokenizer(\n texts,\n max_length=max_length,\n truncation=True,\n padding='max_length',\n return_tensors='pt'\n )\n self.primary_input_ids = primary_encodings['input_ids']\n self.primary_attention_mask = primary_encodings['attention_mask']\n \n print(\"Pre-tokenizing additional texts...\")\n additional_encodings = bert_tokenizer(\n additional_texts,\n max_length=max_length,\n truncation=True,\n padding='max_length',\n return_tensors='pt'\n )\n self.additional_input_ids = additional_encodings['input_ids']\n self.additional_attention_mask = additional_encodings['attention_mask']\n \n print(f\"Pre-tokenization complete. 
Dataset size: {len(self.labels)}\")\n\n def __len__(self):\n return len(self.labels)\n\n def __getitem__(self, idx):\n return (\n self.primary_input_ids[idx],\n self.primary_attention_mask[idx],\n self.additional_input_ids[idx],\n self.additional_attention_mask[idx],\n self.labels[idx]\n )","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"class ProjectionMLP(nn.Module):\n def __init__(self, input_size, output_size):\n super(ProjectionMLP, self).__init__()\n self.layers = nn.Sequential(\n nn.Linear(input_size, output_size),\n nn.ReLU(),\n nn.Linear(output_size, 2)\n )\n\n def forward(self, x):\n return self.layers(x)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"class GumbelTokenSelector(nn.Module):\n def __init__(self, hidden_size, tau=1.0):\n super().__init__()\n self.tau = tau\n self.proj = nn.Linear(hidden_size * 2, 1)\n \n def forward(self, token_embeddings, cls_embedding, training=True):\n \"\"\"\n token_embeddings: (B, L, H)\n cls_embedding: (B, H)\n \"\"\"\n B, L, H = token_embeddings.size()\n \n cls_exp = cls_embedding.unsqueeze(1).expand(-1, L, -1)\n x = torch.cat([token_embeddings, cls_exp], dim=-1)\n \n logits = self.proj(x).squeeze(-1) # (B, L)\n \n if training:\n probs = F.gumbel_softmax(\n torch.stack([logits, torch.zeros_like(logits)], dim=-1),\n tau=self.tau,\n hard=False\n )[..., 0]\n else:\n probs = torch.sigmoid(logits)\n \n return probs, logits","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"class MultiScaleAttentionCNN(nn.Module):\n def __init__(\n self,\n hidden_size=768,\n num_filters=128,\n kernel_sizes=(2, 3, 4),\n dropout=0.3,\n ):\n super().__init__()\n \n self.convs = nn.ModuleList([\n nn.Conv1d(hidden_size, num_filters, k)\n for k in kernel_sizes\n ])\n \n self.attention_fc = nn.Linear(num_filters, 1)\n self.dropout = nn.Dropout(dropout)\n self.out_dim = num_filters * len(kernel_sizes)\n \n def forward(self, x, mask):\n \"\"\"\n x: (B, L, H)\n mask: (B, L)\n \"\"\"\n x = x.transpose(1, 2) # (B, H, L)\n feats = []\n \n for conv in self.convs:\n h = F.relu(conv(x)) # (B, C, L')\n h = h.transpose(1, 2) # (B, L', C)\n \n attn = self.attention_fc(h).squeeze(-1)\n attn = attn.masked_fill(mask[:, :attn.size(1)] == 0, -1e9)\n alpha = F.softmax(attn, dim=1)\n \n pooled = torch.sum(h * alpha.unsqueeze(-1), dim=1)\n feats.append(pooled)\n \n out = torch.cat(feats, dim=1)\n return self.dropout(out)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"class TemporalCNN(nn.Module):\n def __init__(\n self,\n hidden_size=768,\n num_filters=128,\n kernel_sizes=(2, 3, 4),\n dropout=0.1,\n dilation_base=2,\n ):\n super().__init__()\n \n self.kernel_sizes = kernel_sizes\n self.dilation_base = dilation_base\n \n # Dilated convolutions with exponentially increasing dilation rates\n self.convs = nn.ModuleList([\n nn.Conv1d(\n hidden_size, \n num_filters, \n k,\n dilation=dilation_base ** i, # dilation = 2^i\n padding=0 # we'll handle causal padding manually\n )\n for i, k in enumerate(kernel_sizes)\n ])\n \n self.dropout = nn.Dropout(dropout)\n self.out_dim = num_filters * len(kernel_sizes)\n \n def _causal_padding(self, x, kernel_size, dilation):\n \"\"\"\n Apply left padding only (causal) to ensure output at time t\n only depends on inputs from time 0 to t.\n \"\"\"\n # Calculate required padding: (kernel_size - 1) * dilation\n padding = (kernel_size - 1) * dilation\n # Pad only on the left 
(temporal dimension)\n return F.pad(x, (padding, 0))\n \n def forward(self, x, attention_mask):\n \"\"\"\n x: (B, L, H)\n attention_mask: (B, L)\n \"\"\"\n # zero-out padding tokens\n mask = attention_mask.unsqueeze(-1)\n x = x * mask\n \n # (B, H, L) for Conv1d\n x = x.transpose(1, 2)\n \n feats = []\n for i, conv in enumerate(self.convs):\n kernel_size = self.kernel_sizes[i]\n dilation = self.dilation_base ** i\n \n # Apply causal padding (left-only)\n x_padded = self._causal_padding(x, kernel_size, dilation)\n \n # Apply dilated convolution\n c = F.relu(conv(x_padded)) # (B, C, L')\n \n # Global max pooling over the temporal dimension\n p = F.max_pool1d(c, kernel_size=c.size(2)).squeeze(2) # (B, C)\n \n feats.append(p)\n \n out = torch.cat(feats, dim=1) # (B, num_filters * len(kernel_sizes))\n return self.dropout(out)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"class ConcatModel(nn.Module):\n def __init__(\n self,\n hatebert_model,\n additional_model,\n temporal_cnn,\n msa_cnn,\n selector,\n projection_mlp,\n unfreeze_n_layers_hate=12, #hatebert unfreeze all 12 layers\n unfreeze_n_layers_add=0, # additional bert freeze all layers\n hate_pooler=True,\n add_pooler=False\n \n ):\n super().__init__()\n \n self.hatebert_model = hatebert_model\n self.additional_model = additional_model\n \n self.temporal_cnn = temporal_cnn\n self.msa_cnn = msa_cnn\n self.selector = selector\n self.projection_mlp = projection_mlp\n \n # freeze everything on additional bert\n for p in self.additional_model.parameters():\n p.requires_grad = False\n\n # will unfreeze last n layers of additional model\n add_num_layers = len(self.additional_model.encoder.layer)\n for i in range(add_num_layers - unfreeze_n_layers_add, add_num_layers):\n for param in self.additional_model.encoder.layer[i].parameters():\n param.requires_grad = True\n\n if self.additional_model.pooler is not None and add_pooler:\n for p in self.additional_model.pooler.parameters():\n p.requires_grad = True\n\n # freeze everything on hatebert\n for p in self.hatebert_model.parameters():\n p.requires_grad = False\n \n # FIX: Use hatebert_model to get the correct number of layers\n hate_num_layers = len(self.hatebert_model.encoder.layer)\n for i in range(hate_num_layers - unfreeze_n_layers_hate, hate_num_layers):\n for param in self.hatebert_model.encoder.layer[i].parameters():\n param.requires_grad = True\n \n if self.hatebert_model.pooler is not None and hate_pooler:\n for p in self.hatebert_model.pooler.parameters():\n p.requires_grad = True\n\n \n def forward(self, input_ids, attention_mask, additional_input_ids, additional_attention_mask):\n # ================= HateBERT =================\n hate_outputs = self.hatebert_model(\n input_ids=input_ids,\n attention_mask=attention_mask,\n )\n seq_emb = hate_outputs.last_hidden_state # (B, L, H)\n cls_emb = seq_emb[:, 0, :] # (B, H)\n \n # ---- Token Selector ----\n token_probs, token_logits = self.selector(seq_emb, cls_emb, self.training)\n \n # ---- Temporal CNN on FULL embeddings (NOT masked) ----\n temporal_feat = self.temporal_cnn(seq_emb, attention_mask)\n \n # ---- Rationale-Weighted Summary Vector H_r ----\n weights = token_probs.unsqueeze(-1) # (B, L, 1)\n H_r = (seq_emb * weights).sum(dim=1) / (weights.sum(dim=1) + 1e-6)\n \n # ================= Frozen Rationale BERT =================\n add_outputs = self.additional_model(\n input_ids=additional_input_ids,\n attention_mask=additional_attention_mask,\n )\n add_seq = add_outputs.last_hidden_state\n 
\n # ---- Multi-Scale Attention CNN ----\n msa_feat = self.msa_cnn(add_seq, additional_attention_mask)\n \n # ================= CONCAT (4 components) =================\n concat = torch.cat([cls_emb, temporal_feat, msa_feat, H_r], dim=1)\n \n logits = self.projection_mlp(concat)\n return logits","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"class EarlyStopping:\n \"\"\"\n Early stopping to stop training when validation loss doesn't improve.\n \"\"\"\n def __init__(self, patience=10, min_delta=1.0, mode='min', verbose=True):\n \"\"\"\n Args:\n patience (int): How many epochs to wait after last improvement.\n min_delta (float): Minimum change to qualify as an improvement.\n mode (str): 'min' for loss, 'max' for accuracy/f1.\n verbose (bool): Print messages when improvement occurs.\n \"\"\"\n self.patience = patience\n self.min_delta = min_delta\n self.mode = mode\n self.verbose = verbose\n self.counter = 0\n self.best_score = None\n self.early_stop = False\n self.best_model_state = None\n \n def __call__(self, current_score, model):\n \"\"\"\n Call this after each epoch with the validation metric and model.\n \n Args:\n current_score: Current epoch's validation metric (loss, accuracy, f1, etc.)\n model: The model to save if there's improvement\n \n Returns:\n bool: True if training should stop, False otherwise\n \"\"\"\n if self.best_score is None:\n # First epoch\n self.best_score = current_score\n self.save_checkpoint(model)\n if self.verbose:\n print(f\"Initial best score: {self.best_score:.4f}\")\n else:\n # Check if there's improvement\n if self.mode == 'min':\n improved = current_score < (self.best_score - self.min_delta)\n else: # mode == 'max'\n improved = current_score > (self.best_score + self.min_delta)\n \n if improved:\n self.best_score = current_score\n self.save_checkpoint(model)\n self.counter = 0\n if self.verbose:\n print(f\"Validation improved! New best score: {self.best_score:.4f}\")\n else:\n self.counter += 1\n if self.verbose:\n print(f\"No improvement. Patience counter: {self.counter}/{self.patience}\")\n \n if self.counter >= self.patience:\n self.early_stop = True\n if self.verbose:\n print(f\"Early stopping triggered! 
Best score: {self.best_score:.4f}\")\n \n return self.early_stop\n \n def save_checkpoint(self, model):\n \"\"\"Save model state dict\"\"\"\n import copy\n self.best_model_state = copy.deepcopy(model.state_dict())\n \n def load_best_model(self, model):\n \"\"\"Load the best model state into the model\"\"\"\n if self.best_model_state is not None:\n model.load_state_dict(self.best_model_state)\n if self.verbose:\n print(f\"Loaded best model with score: {self.best_score:.4f}\")\n return model","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def main(args):\n torch.manual_seed(args.seed)\n torch.cuda.empty_cache()\n device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n\n\n file_map = {\n \"gab\": '/kaggle/input/datasets/jonniellm/final-dataset/Mistral_Rationales_file_GAB_dataset(85-15).csv',\n \"twitter\": '/kaggle/input/datasets/jonniellm/final-dataset/Mistral_Rationales_file_Twitter_dataset(85-15).csv',\n \"reddit\": '/kaggle/input/datasets/jonniellm/final-dataset/Mistral_Rationales_file_REDDIT_dataset(85-15).csv',\n \"youtube\": '/kaggle/input/datasets/jonniellm/final-dataset/Mistral_Rationales_file_YOUTUBE_dataset(85-15).csv',\n \"implicit\": '/kaggle/input/datasets/jonniellm/final-dataset/Mistral_Rationales_file_IMPLICIT_dataset(85-15).csv'\n }\n\n file_path = file_map[args.dataset]\n df = pd.read_csv(file_path)\n train_df = df[df['exp_split'] == 'train']\n test_df = df[df['exp_split'] == 'test']\n\n print(\"Train df: \", len(train_df))\n print(\"Test_df: \", len(test_df))\n\n import gc\n # del variables\n gc.collect()\n\n \n tokenizer = args.hate_tokenizer ## need this for tokenizing the input text in data loader\n tokenizer_bert = args.bert_tokenizer\n #Splitting training and validation testing split to test accuracy\n train_idx, val_idx = train_test_split(\n train_df.index,\n test_size=0.2,\n stratify=train_df[\"label\"],\n random_state=args.seed\n )\n \n if args.dataset == \"implicit\":\n train_text = train_df.loc[train_idx, \"post\"].tolist()\n val_texts = train_df.loc[val_idx, \"post\"].tolist()\n else:\n train_text = train_df.loc[train_idx, \"text\"].tolist()\n val_texts = train_df.loc[val_idx, \"text\"].tolist()\n \n add_train_text = train_df.loc[train_idx, \"Mistral_Rationales\"].tolist()\n add_val_texts = train_df.loc[val_idx, \"Mistral_Rationales\"].tolist()\n \n train_labels = train_df.loc[train_idx, \"label\"].tolist()\n val_labels = train_df.loc[val_idx, \"label\"].tolist()\n\n train_dataset = AdditionalCustomDataset(\n train_text,\n train_labels,\n add_train_text,\n tokenizer,\n tokenizer_bert,\n max_length=512\n )\n \n val_dataset = AdditionalCustomDataset(\n val_texts,\n val_labels,\n add_val_texts,\n tokenizer,\n tokenizer_bert,\n max_length=512\n )\n\n #Creating dataloader object to train the model\n # num_workers=2 for parallel data loading, pin_memory=True for faster GPU transfer\n train_dataloader = DataLoader(\n train_dataset, \n batch_size=args.batch_size, \n shuffle=True,\n num_workers=2,\n pin_memory=True if torch.cuda.is_available() else False\n )\n val_dataloader = DataLoader(\n val_dataset, \n batch_size=args.batch_size, \n shuffle=False,\n num_workers=2,\n pin_memory=True if torch.cuda.is_available() else False\n )\n hatebert_model = args.hatebert_model\n additional_model = args.additional_model\n \n \n temporal_cnn = TemporalCNN(\n hidden_size=768,\n num_filters=args.temp_filters, # 64 - 128\n kernel_sizes=(2, 3, 4),\n dropout=args.temp_dropout,\n dilation_base=args.temp_dilate # 0 - 4\n 
).to(device)\n\n msa_cnn = MultiScaleAttentionCNN(\n hidden_size=768,\n num_filters=args.msa_filters, # 64 - 128\n kernel_sizes=(2, 3, 4),\n dropout=args.msa_dropout\n ).to(device)\n \n selector = GumbelTokenSelector(\n hidden_size=768,\n tau=1.0\n ).to(device)\n \n projection_mlp = ProjectionMLP(\n input_size=temporal_cnn.out_dim + msa_cnn.out_dim + 768 * 2,\n output_size=512\n ).to(device)\n\n\n\n concat_model = ConcatModel(\n hatebert_model=hatebert_model,\n additional_model=additional_model,\n temporal_cnn=temporal_cnn,\n msa_cnn=msa_cnn,\n selector=selector,\n projection_mlp=projection_mlp,\n unfreeze_n_layers_hate=args.hate_layers, #hatebert unfreeze all 12 layers id 12 by default\n unfreeze_n_layers_add=args.add_layers, # additional bert freeze all layers if 0 by default\n hate_pooler=args.hate_pooler, #bool to controle if pooler is frozen or not true=not frozen\n add_pooler=args.add_pooler #bool to controle if pooler is frozen or not true=not froze\n ).to(device)\n\n optimizer = AdamW(\n concat_model.parameters(),\n lr=args.lr, # 2e-5\n weight_decay=args.wd\n )\n criterion = nn.CrossEntropyLoss().to(device)\n\n # criterion = criterion.to(device)\n\n os.makedirs(\"/kaggle/working/models\", exist_ok=True)\n\n history = {\n \"train_loss\": [],\n \"val_loss\": [], #loss\n \"train_acc\": [],\n \"train_precision\": [],\n \"train_recall\": [],\n \"train_f1\": [], #train binary\n \"val_acc\": [],\n \"val_precision\": [],\n \"val_recall\": [],\n \"val_f1\": [], #validation binary\n \"train_acc_weighted\": [],\n \"train_precision_weighted\": [],\n \"train_recall_weighted\": [],\n \"train_f1_weighted\": [], # train weighted\n \"val_acc_weighted\": [],\n \"val_precision_weighted\": [],\n \"val_recall_weighted\": [],\n \"val_f1_weighted\": [], # validation weighted\n \"train_acc_macro\": [],\n \"train_precision_macro\": [],\n \"train_recall_macro\": [],\n \"train_f1_macro\": [], #train macro\n \"val_acc_macro\": [],\n \"val_precision_macro\": [],\n \"val_recall_macro\": [],\n \"val_f1_macro\": [],# validation macro\n \"epoch_time\": [],\n \"train_throughput\": [],\n \"val_confidence_mean\": [],\n \"val_confidence_std\": [],\n \"gpu_memory_mb\": [],\n }\n\n early_stopping = EarlyStopping(patience=args.patience, min_delta=0.001, mode='min', verbose=True) # early stop on loss\n\n for epoch in range(args.num_epochs):\n epoch_val_confidences = []\n epoch_start_time = time.time()\n samples_seen = 0\n \n concat_model.train()\n\n train_losses = []\n train_preds = []\n train_labels_epoch = []\n train_accuracy = 0\n train_epoch_size = 0\n\n with tqdm(train_dataloader, desc=f'Epoch {epoch + 1}', dynamic_ncols=True) as loop:\n for batch in loop:\n input_ids, attention_mask, additional_input_ids, additional_attention_mask, labels = batch\n \n samples_seen += labels.size(0)\n \n if torch.cuda.is_available():\n input_ids = input_ids.to(device)\n attention_mask = attention_mask.to(device)\n additional_input_ids = additional_input_ids.to(device)\n additional_attention_mask = additional_attention_mask.to(device)\n labels = labels.to(device)\n\n # Forward pass through the ConcatModel\n optimizer.zero_grad()\n outputs = concat_model(input_ids=input_ids, attention_mask=attention_mask, additional_input_ids=additional_input_ids, additional_attention_mask=additional_attention_mask)\n loss = criterion(outputs, labels)\n\n # Backward pass and optimization\n loss.backward()\n torch.nn.utils.clip_grad_norm_(concat_model.parameters(), max_norm=args.max_grad_norm)\n optimizer.step()\n \n probs = torch.softmax(outputs, 
dim=1)\n confidences, predictions = torch.max(probs, dim=1)\n train_preds.extend(predictions.cpu().numpy())\n train_labels_epoch.extend(labels.cpu().numpy())\n\n train_losses.append(loss.item())\n\n # Update accuracy and epoch size\n predictions = torch.argmax(outputs, dim=1)\n train_accuracy += (predictions == labels).sum().item()\n train_epoch_size += len(labels)\n \n epoch_train_time = time.time() - epoch_start_time\n train_throughput = samples_seen / epoch_train_time \n \n # Calculate train metrics (binary, weighted, macro)\n train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(\n train_labels_epoch, train_preds, average='binary'\n )\n train_precision_weighted, train_recall_weighted, train_f1_weighted, _ = precision_recall_fscore_support(\n train_labels_epoch, train_preds, average='weighted'\n )\n train_precision_macro, train_recall_macro, train_f1_macro, _ = precision_recall_fscore_support(\n train_labels_epoch, train_preds, average='macro'\n )\n train_acc = accuracy_score(train_labels_epoch, train_preds)\n\n # Evaluation on the validation set\n concat_model.eval()\n\n val_predictions = []\n val_labels_epoch = []\n val_loss = 0\n num_batches = 0\n\n with torch.no_grad(), tqdm(val_dataloader, desc='Validation', dynamic_ncols=True) as loop:\n for batch in loop:\n input_ids, attention_mask, additional_input_ids, additional_attention_mask, labels = batch\n\n if torch.cuda.is_available():\n input_ids = input_ids.to(device)\n attention_mask = attention_mask.to(device)\n additional_input_ids = additional_input_ids.to(device)\n additional_attention_mask = additional_attention_mask.to(device)\n labels = labels.to(device)\n\n # Forward pass through the ConcatModel\n outputs = concat_model(input_ids=input_ids, attention_mask=attention_mask, additional_input_ids=additional_input_ids, additional_attention_mask=additional_attention_mask)\n loss = criterion(outputs, labels)\n val_loss += loss.item()\n num_batches += 1\n probs = torch.softmax(outputs, dim=1)\n confidences, predictions = torch.max(probs, dim=1)\n\n epoch_val_confidences.extend(confidences.cpu().numpy())\n \n val_predictions.extend(predictions.cpu().numpy())\n val_labels_epoch.extend(labels.cpu().numpy())\n \n val_loss /= num_batches\n \n # Calculate validation metrics (binary)\n val_accuracy = accuracy_score(val_labels_epoch, val_predictions)\n val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(\n val_labels_epoch, val_predictions, average='binary'\n )\n # Calculate validation metrics (weighted)\n val_precision_weighted, val_recall_weighted, val_f1_weighted, _ = precision_recall_fscore_support(\n val_labels_epoch, val_predictions, average='weighted'\n )\n # Calculate validation metrics (macro)\n val_precision_macro, val_recall_macro, val_f1_macro, _ = precision_recall_fscore_support(\n val_labels_epoch, val_predictions, average='macro'\n )\n \n print(f\"Epoch {epoch}:\")\n print(f\" Train Accuracy: {train_acc:.4f}\")\n print(f\" Validation Accuracy: {val_accuracy:.4f}\")\n print(f\" Train Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}\")\n print(f\" Val Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}\")\n print(f\" Avg. 
Train Loss: {sum(train_losses) / len(train_losses):.4f}\")\n print(f\" Validation Loss: {val_loss:.4f}\")\n epoch_time = time.time() - epoch_start_time\n conf_mean = np.mean(epoch_val_confidences)\n conf_std = np.std(epoch_val_confidences)\n\n # Append all train metrics to history\n history[\"train_loss\"].append(np.mean(train_losses))\n history[\"train_acc\"].append(train_acc)\n history[\"train_precision\"].append(train_precision)\n history[\"train_recall\"].append(train_recall)\n history[\"train_f1\"].append(train_f1)\n history[\"train_acc_weighted\"].append(train_acc) # accuracy is same for all averaging\n history[\"train_precision_weighted\"].append(train_precision_weighted)\n history[\"train_recall_weighted\"].append(train_recall_weighted)\n history[\"train_f1_weighted\"].append(train_f1_weighted)\n history[\"train_acc_macro\"].append(train_acc) # accuracy is same for all averaging\n history[\"train_precision_macro\"].append(train_precision_macro)\n history[\"train_recall_macro\"].append(train_recall_macro)\n history[\"train_f1_macro\"].append(train_f1_macro)\n \n # Append all validation metrics to history\n history[\"val_loss\"].append(val_loss)\n history[\"val_acc\"].append(val_accuracy)\n history[\"val_precision\"].append(val_precision)\n history[\"val_recall\"].append(val_recall)\n history[\"val_f1\"].append(val_f1)\n history[\"val_acc_weighted\"].append(val_accuracy) # accuracy is same for all averaging\n history[\"val_precision_weighted\"].append(val_precision_weighted)\n history[\"val_recall_weighted\"].append(val_recall_weighted)\n history[\"val_f1_weighted\"].append(val_f1_weighted)\n history[\"val_acc_macro\"].append(val_accuracy) # accuracy is same for all averaging\n history[\"val_precision_macro\"].append(val_precision_macro)\n history[\"val_recall_macro\"].append(val_recall_macro)\n history[\"val_f1_macro\"].append(val_f1_macro)\n \n # Append efficiency metrics\n history[\"epoch_time\"].append(epoch_time)\n history[\"train_throughput\"].append(train_throughput)\n history[\"val_confidence_mean\"].append(conf_mean)\n history[\"val_confidence_std\"].append(conf_std)\n \n if torch.cuda.is_available():\n history[\"gpu_memory_mb\"].append(\n torch.cuda.max_memory_allocated() / 1024**2\n )\n torch.cuda.reset_peak_memory_stats()\n else:\n history[\"gpu_memory_mb\"].append(0)\n \n print(f\" Epoch Time (s): {epoch_time:.2f}\")\n print(f\" Throughput (samples/sec): {train_throughput:.2f}\")\n print(f\" Val Confidence Mean: {conf_mean:.4f} ± {conf_std:.4f}\")\n \n current_metric = val_loss\n if early_stopping(current_metric, concat_model):\n print(f\"\\n{'='*50}\")\n print(f\"Early stopping at epoch {epoch+1}\")\n print(f\"{'='*50}\\n\")\n break\n model = early_stopping.load_best_model(concat_model)\n torch.save(model.state_dict(), f\"/kaggle/working/models/{args.dataset}_concat_model.pt\")\n print(f\"Best model saved to /kaggle/working/models/{args.dataset}_concat_model.pt\")\n\n checkpoint = {\n \"model_state_dict\": model.state_dict(),\n \"history\": history,\n }\n torch.save(checkpoint, f\"/kaggle/working/models/{args.dataset}_concat_checkpoint.pt\")\n print(f\"Checkpoint with history saved to /kaggle/working/models/{args.dataset}_concat_checkpoint.pt\")\n\n if args.dataset == \"implicit\":\n test_texts = test_df[\"post\"].tolist()\n else:\n test_texts = test_df[\"text\"].tolist()\n \n add_test_texts = test_df[\"Mistral_Rationales\"].tolist()\n test_labels = test_df[\"label\"].tolist()\n\n test_dataset = AdditionalCustomDataset(test_texts, test_labels, add_test_texts, tokenizer, 
tokenizer_bert, max_length=512)\n test_dataloader = DataLoader(\n test_dataset, \n batch_size=args.batch_size, # Use same batch size as training for faster inference\n shuffle=False,\n num_workers=2,\n pin_memory=True if torch.cuda.is_available() else False\n )\n\n # ================= TEST EVALUATION WITH EFFICIENCY =================\n model.eval()\n test_predictions = []\n test_true_labels = []\n test_confidences = []\n\n samples_seen = 0\n test_start_time = time.time()\n \n if torch.cuda.is_available():\n torch.cuda.reset_peak_memory_stats()\n \n with torch.no_grad(), tqdm(test_dataloader, desc='Testing', dynamic_ncols=True) as loop:\n for batch in loop:\n input_ids, attention_mask, additional_input_ids, additional_attention_mask, labels = batch\n \n batch_size = labels.size(0)\n samples_seen += batch_size\n \n input_ids = input_ids.to(device)\n attention_mask = attention_mask.to(device)\n additional_input_ids = additional_input_ids.to(device)\n additional_attention_mask = additional_attention_mask.to(device)\n labels = labels.to(device)\n \n outputs = model(\n input_ids=input_ids,\n attention_mask=attention_mask,\n additional_input_ids=additional_input_ids,\n additional_attention_mask=additional_attention_mask\n )\n \n probs = torch.softmax(outputs, dim=1)\n confidences, preds = torch.max(probs, dim=1)\n \n test_confidences.extend(confidences.cpu().numpy())\n test_predictions.extend(preds.cpu().numpy())\n test_true_labels.extend(labels.cpu().numpy())\n\n \n # ================= TEST METRICS =================\n test_time = time.time() - test_start_time\n test_throughput = samples_seen / test_time\n \n accuracy = accuracy_score(test_true_labels, test_predictions)\n precision, recall, f1, _ = precision_recall_fscore_support(\n test_true_labels, test_predictions, average='weighted'\n )\n conf_mean = np.mean(test_confidences)\n conf_std = np.std(test_confidences)\n cm = confusion_matrix(test_true_labels, test_predictions)\n \n gpu_memory_mb = torch.cuda.max_memory_allocated() / 1024**2 if torch.cuda.is_available() else 0\n\n print(\"\\n================= FINAL TEST RESULTS =================\")\n print(f\"Dataset: {args.dataset}, Seed: {args.seed}, Epochs: {args.num_epochs}\")\n print(f\"Test Accuracy : {accuracy:.4f}\")\n print(f\"Test Precision: {precision:.4f}\")\n print(f\"Test Recall : {recall:.4f}\")\n print(f\"Test F1-score : {f1:.4f}\")\n print(f\"Test Confidence Mean ± Std: {conf_mean:.4f} ± {conf_std:.4f}\")\n print(f\"Test Time (s) : {test_time:.2f}\")\n print(f\"Throughput (samples/sec) : {test_throughput:.2f}\")\n print(f\"Peak GPU Memory (MB) : {gpu_memory_mb:.2f}\")\n print(\"\\nClassification Report:\")\n print(classification_report(test_true_labels, test_predictions))\n print(\"\\nConfusion Matrix:\")\n # Plot confusion matrix with seaborn\n plt.figure(figsize=(8, 6))\n sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', \n xticklabels=['Non-Hate', 'Hate'],\n yticklabels=['Non-Hate', 'Hate'],\n cbar_kws={'label': 'Count'})\n plt.title('Confusion Matrix', fontsize=14, pad=20)\n plt.ylabel('True label', fontsize=12)\n plt.xlabel('Predicted label', fontsize=12)\n plt.tight_layout()\n plt.show()\n print(\"======================================================\")\n\n return model, history","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"from argparse import Namespace\nfrom transformers import AutoTokenizer, AutoModel\n\n# Load tokenizers and models ONCE (outside objective to save time)\ndevice = torch.device('cuda:0' if 
torch.cuda.is_available() else 'cpu')\nhate_tokenizer = AutoTokenizer.from_pretrained(\"GroNLP/hateBERT\")\nbert_tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n\ntorch.cuda.empty_cache()\n\n# Load fresh models for each trial (to reset weights)\nhatebert_model = AutoModel.from_pretrained(\"GroNLP/hateBERT\").to(device)\nadditional_model = AutoModel.from_pretrained(\"bert-base-uncased\").to(device)\n\n# Sample hyperparameters\nargs = Namespace(\n # Fixed\n seed=42,\n dataset=\"reddit\", # Change as needed: \"gab\", \"twitter\", \"reddit\", \"youtube\", \"implicit\"\n \n # Tokenizers & Models\n hate_tokenizer=hate_tokenizer,\n bert_tokenizer=bert_tokenizer,\n hatebert_model=hatebert_model,\n additional_model=additional_model,\n \n # Training hyperparameters\n batch_size=32,\n num_epochs=20,\n lr=1e-5,\n wd=0.01,\n patience=2,\n max_grad_norm=2.0,\n \n # TemporalCNN hyperparameters\n temp_filters=256,\n temp_dropout=0.1,\n temp_dilate=3,\n \n # MultiScaleAttentionCNN hyperparameters\n msa_filters=64,\n msa_dropout=0.23,\n \n # Layer unfreezing hyperparameters\n hate_layers=10,\n add_layers=0,\n hate_pooler=True,\n add_pooler=True,\n)\n \nmodel, history = main(args)\n\n\n ","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"fig, axes = plt.subplots(3, 4, figsize=(20, 15))\n\n# Row 1: Train vs Val comparisons\n# Accuracy\naxes[0, 0].plot(history['train_acc'], label='Train', marker='o')\naxes[0, 0].plot(history['val_acc'], label='Validation', marker='s')\naxes[0, 0].set_title('Accuracy')\naxes[0, 0].set_xlabel('Epoch')\naxes[0, 0].set_ylabel('Accuracy')\naxes[0, 0].legend()\naxes[0, 0].grid(True)\n\n# Loss\naxes[0, 1].plot(history['train_loss'], label='Train', marker='o')\naxes[0, 1].plot(history['val_loss'], label='Validation', marker='s')\naxes[0, 1].set_title('Loss')\naxes[0, 1].set_xlabel('Epoch')\naxes[0, 1].set_ylabel('Loss')\naxes[0, 1].legend()\naxes[0, 1].grid(True)\n\n# F1 Score\naxes[0, 2].plot(history['train_f1'], label='Train', marker='o')\naxes[0, 2].plot(history['val_f1'], label='Validation', marker='s')\naxes[0, 2].set_title('F1 Score (Binary)')\naxes[0, 2].set_xlabel('Epoch')\naxes[0, 2].set_ylabel('F1')\naxes[0, 2].legend()\naxes[0, 2].grid(True)\n\n# Precision\naxes[0, 3].plot(history['train_precision'], label='Train', marker='o')\naxes[0, 3].plot(history['val_precision'], label='Validation', marker='s')\naxes[0, 3].set_title('Precision (Binary)')\naxes[0, 3].set_xlabel('Epoch')\naxes[0, 3].set_ylabel('Precision')\naxes[0, 3].legend()\naxes[0, 3].grid(True)\n\n# Row 2: More Train vs Val + Individual metrics\n# Recall\naxes[1, 0].plot(history['train_recall'], label='Train', marker='o')\naxes[1, 0].plot(history['val_recall'], label='Validation', marker='s')\naxes[1, 0].set_title('Recall (Binary)')\naxes[1, 0].set_xlabel('Epoch')\naxes[1, 0].set_ylabel('Recall')\naxes[1, 0].legend()\naxes[1, 0].grid(True)\n\n# Epoch Time\naxes[1, 1].plot(history['epoch_time'], marker='o', color='green')\naxes[1, 1].set_title('Epoch Time')\naxes[1, 1].set_xlabel('Epoch')\naxes[1, 1].set_ylabel('Time (s)')\naxes[1, 1].grid(True)\n\n# Train Throughput\naxes[1, 2].plot(history['train_throughput'], marker='o', color='purple')\naxes[1, 2].set_title('Train Throughput')\naxes[1, 2].set_xlabel('Epoch')\naxes[1, 2].set_ylabel('Samples/sec')\naxes[1, 2].grid(True)\n\n# Validation Confidence\naxes[1, 3].errorbar(range(len(history['val_confidence_mean'])), \n history['val_confidence_mean'], \n yerr=history['val_confidence_std'], \n marker='o', 
color='orange', capsize=3)\naxes[1, 3].set_title('Validation Confidence')\naxes[1, 3].set_xlabel('Epoch')\naxes[1, 3].set_ylabel('Confidence')\naxes[1, 3].grid(True)\n\n# Row 3: GPU Memory and Weighted/Macro metrics\n# GPU Memory\naxes[2, 0].plot(history['gpu_memory_mb'], marker='o', color='red')\naxes[2, 0].set_title('GPU Memory Usage')\naxes[2, 0].set_xlabel('Epoch')\naxes[2, 0].set_ylabel('Memory (MB)')\naxes[2, 0].grid(True)\n\n# Weighted F1 comparison\naxes[2, 1].plot(history['train_f1_weighted'], label='Train', marker='o')\naxes[2, 1].plot(history['val_f1_weighted'], label='Validation', marker='s')\naxes[2, 1].set_title('F1 Score (Weighted)')\naxes[2, 1].set_xlabel('Epoch')\naxes[2, 1].set_ylabel('F1')\naxes[2, 1].legend()\naxes[2, 1].grid(True)\n\n# Macro F1 comparison\naxes[2, 2].plot(history['train_f1_macro'], label='Train', marker='o')\naxes[2, 2].plot(history['val_f1_macro'], label='Validation', marker='s')\naxes[2, 2].set_title('F1 Score (Macro)')\naxes[2, 2].set_xlabel('Epoch')\naxes[2, 2].set_ylabel('F1')\naxes[2, 2].legend()\naxes[2, 2].grid(True)\n\n# Hide last subplot\naxes[2, 3].axis('off')\n\nplt.suptitle('Training History', fontsize=16, y=1.02)\nplt.tight_layout()\nplt.show()","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}
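A quick sanity check on the TemporalCNN cell in the notebook above: with left-only padding of (kernel_size - 1) * dilation, a dilated Conv1d keeps the output length equal to the input length and stays causal (position t only sees positions 0..t). A self-contained sketch with random tensors, shapes only, assuming the default dilation_base=2 (not part of the notebook):

import torch
import torch.nn as nn
import torch.nn.functional as F

B, H, L = 2, 768, 16                 # batch, hidden size, sequence length
x = torch.randn(B, H, L)             # layout after transpose(1, 2) in the notebook
kernel_size, dilation = 4, 4         # third branch: k=4, dilation=2**2
conv = nn.Conv1d(H, 128, kernel_size, dilation=dilation)

pad = (kernel_size - 1) * dilation   # left-only padding => causal convolution
y = conv(F.pad(x, (pad, 0)))
print(y.shape)                       # torch.Size([2, 128, 16]): length preserved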
pyproject.toml
CHANGED
@@ -13,6 +13,7 @@ dependencies = [
     "pandas>=2.3.3",
     "plotly>=5.14.0",
     "psutil>=7.2.2",
+    "python-dotenv>=1.0.0",
     "regex>=2023.5.0",
     "requests>=2.31.0",
     "safetensors>=0.3.0",
src/app.py
CHANGED
@@ -1,3 +1,5 @@
+import gc
+
 import streamlit as st
 from hatespeech_model import predict_hatespeech, load_model_from_hf, predict_hatespeech_from_file, get_rationale_from_mistral, preprocess_rationale_mistral
 import plotly.graph_objects as go
@@ -192,6 +194,8 @@ if classify_button:
         model_type="base"
     )
     base_end = time.time()
+
+    gc.collect()  # Clean up memory after inference

     # Extract results for both models
     base_prediction = base_model_result['prediction']
@@ -400,6 +404,8 @@ if classify_button:
     st.success("✅ File analysis complete for both models!")
     st.divider()
     st.header("📊 Analysis Results - Model Comparison")
+
+    gc.collect()  # Clean up memory after file inference

     # Side-by-side results columns
     base_file_col, enhanced_file_col = st.columns(2)
src/hatespeech_model.py
CHANGED
@@ -12,10 +12,14 @@ import os
 import numpy as np
 import requests
 import json
+from dotenv import load_dotenv

-
-
-
+# Load environment variables from .env file
+load_dotenv()
+
+API_BASE_URL = os.getenv("CLOUDFLARE_API_BASE_URL")
+HEADERS = {"Authorization": f"Bearer {os.getenv('CLOUDFLARE_API_TOKEN')}"}
+MODEL_NAME = os.getenv("CLOUDFLARE_MODEL_NAME")

 def create_prompt(text):
     return f"""
@@ -380,11 +384,7 @@ def load_model_from_hf(model_type="altered"):
     config = json.load(f)

     # Load checkpoint with proper handling for numpy dtypes (PyTorch 2.6+ compatibility)
-    try:
-        checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
-    except TypeError:
-        # Fallback for older PyTorch versions
-        checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
+    checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)

     # Handle nested config structure (base model uses model_config, altered uses flat structure)
     if 'model_config' in config:
@@ -403,7 +403,6 @@ def load_model_from_hf(model_type="altered"):

     # Rebuild architecture based on model type using training_config values when available
     H = hatebert_model.config.hidden_size
-    max_length = training_config.get('max_length', 128)

     # common params from training config (use None to allow inference from checkpoint)
     adapter_dim = training_config.get('adapter_dim', training_config.get('adapter_size', None))
@@ -535,10 +534,6 @@ def load_model_from_hf(model_type="altered"):

     model = model.to(device)

-    # Verify model is in eval mode
-    print(f"Model training mode: {model.training}")
-    print(f"Dropout layers found: {sum(1 for _ in model.modules() if isinstance(_, (nn.Dropout, nn.Dropout1d, nn.Dropout2d, nn.Dropout3d)))}")
-
     # Create a unified config dict with max_length at top level for compatibility
     unified_config = config.copy()
     if 'max_length' not in unified_config and 'training_config' in config:
@@ -546,30 +541,6 @@ def load_model_from_hf(model_type="altered"):

     return model, tokenizer_hatebert, tokenizer_rationale, unified_config, device

-
-def combined_loss(logits, labels, rationale_probs, selector_logits, rationale_mask=None, attns=None, attn_weight=0.0, rationale_weight=1.0):
-    cls_loss = F.cross_entropy(logits, labels)
-
-    # supervise selector logits with BCE-with-logits against rationale mask (if available)
-    if rationale_mask is not None:
-        selector_loss = F.binary_cross_entropy_with_logits(selector_logits, rationale_mask.to(selector_logits.device))
-    else:
-        selector_loss = torch.tensor(0.0, device=cls_loss.device)
-
-    # optional attention alignment loss (disabled by default)
-    attn_loss = torch.tensor(0.0, device=cls_loss.device)
-    if attns is not None and attn_weight > 0.0:
-        try:
-            last_attn = attns[-1]  # (B, H, L, L)
-            attn_mass = last_attn.mean(1).mean(1)  # (B, L)
-            attn_loss = F.mse_loss(attn_mass, rationale_mask.to(attn_mass.device))
-        except Exception:
-            attn_loss = torch.tensor(0.0, device=cls_loss.device)
-
-    total_loss = cls_loss + rationale_weight * selector_loss + attn_weight * attn_loss
-    return total_loss, cls_loss.item(), selector_loss.item(), attn_loss.item()
-
-
 def predict_text(text, rationale, model, tokenizer_hatebert, tokenizer_rationale,
                  device='cpu', max_length=128, model_type="altered"):
     # Ensure model is in eval mode (defensive programming)
@@ -622,15 +593,14 @@ def predict_text(text, rationale, model, tokenizer_hatebert, tokenizer_rationale
     probs = F.softmax(scaled_logits, dim=1)

     if torch.isnan(probs).any() or torch.isinf(probs).any():
-        print(f"WARNING: NaN or Inf in probabilities. Logits: {logits}")
         # Fallback to uniform distribution
         probs = torch.ones_like(logits) / logits.size(1)

     prediction = logits.argmax(dim=1).item()
     confidence = probs[0, prediction].item()

-    # Debug: Print logits and probs for first few predictions
-    print(f"Debug - Logits: {logits[0].cpu().numpy()}, Probs: {probs[0].cpu().numpy()}")
+    # # Debug: Print logits and probs for first few predictions
+    # print(f"Debug - Logits: {logits[0].cpu().numpy()}, Probs: {probs[0].cpu().numpy()}")

     result = {
         'prediction': prediction,
uv.lock
CHANGED
@@ -676,6 +676,7 @@ dependencies = [
     { name = "pandas" },
     { name = "plotly" },
     { name = "psutil" },
+    { name = "python-dotenv" },
     { name = "regex" },
     { name = "requests" },
     { name = "safetensors" },
@@ -697,6 +698,7 @@ requires-dist = [
     { name = "pandas", specifier = ">=2.3.3" },
     { name = "plotly", specifier = ">=5.14.0" },
     { name = "psutil", specifier = ">=7.2.2" },
+    { name = "python-dotenv", specifier = ">=1.0.0" },
     { name = "regex", specifier = ">=2023.5.0" },
     { name = "requests", specifier = ">=2.31.0" },
     { name = "safetensors", specifier = ">=0.3.0" },
@@ -812,6 +814,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
 ]

+[[package]]
+name = "python-dotenv"
+version = "1.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135, upload-time = "2026-03-01T16:00:26.196Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" },
+]
+
 [[package]]
 name = "pytz"
 version = "2025.2"
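A closing note on the lockfile: entries of this shape are what uv generates itself. Running uv add python-dotenv should update the dependencies and requires-dist tables and append the resolved [[package]] block with its pinned version and hashes, so none of this needs hand-editing.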