AlbertCAC committed on
Commit
3c27def
·
0 Parent(s):
DOCKERFILE ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.10-slim

WORKDIR /app

# Copy only the dependency list first so the pip layer is cached
# unless requirements.txt itself changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source after installing dependencies (better caching).
COPY . .

# Hugging Face Spaces expects the app to listen on port 7860.
EXPOSE 7860

# NOTE(review): the module path `lite_DETECTIVE.app` is not visible in this
# snapshot — confirm it matches the actual package layout before deploying.
CMD ["uvicorn", "lite_DETECTIVE.app:app", "--host", "0.0.0.0", "--port", "7860"]
cold/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LiteDetective - Malicious Content Detection Pipeline
3
+
4
+ Copyright (c) 2025 Albert Zhao
5
+ Author: Albert Zhao Zhaoq@kean.edu Hu Mingcheng
6
+ Created: 2025-05-11
7
+ Updated: 2025-05-11
8
+
9
+ Description:
10
+ Package containing model implementations.
11
+
12
+ License: MIT License
13
+ """
cold/__main__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from .classifier import ToxicTextClassifier
3
+
4
def getArgs():
    """Parse the command-line arguments for the detection CLI.

    Returns:
        argparse.Namespace with `path` (model checkpoint), `device`
        (cpu/mps/cuda), and `args` (one or more texts to classify).
    """
    cli = argparse.ArgumentParser(
        description="LiteDetective - Malicious Content Detection Pipeline"
    )
    cli.add_argument("--path", type=str, default="output/cold.pth",
                     required=False, help="Path to the model")
    cli.add_argument("--device", type=str, default="cpu",
                     required=False, help="Device to use (cpu, mps, or cuda)")
    cli.add_argument("args", nargs='+', help="the text to detect")
    return cli.parse_args()
12
+
13
def main():
    """CLI entry point: parse arguments, load the model, print predictions."""
    cli = getArgs()
    detector = ToxicTextClassifier(path=cli.path)
    print(detector.predict(cli.args, device=cli.device))
18
+
19
+ if __name__ == "__main__":
20
+ main()
cold/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (434 Bytes). View file
 
cold/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (443 Bytes). View file
 
cold/__pycache__/__main__.cpython-312.pyc ADDED
Binary file (1.42 kB). View file
 
cold/__pycache__/classifier.cpython-310.pyc ADDED
Binary file (8.21 kB). View file
 
cold/__pycache__/classifier.cpython-312.pyc ADDED
Binary file (14.4 kB). View file
 
cold/__pycache__/dynamic_conv.cpython-310.pyc ADDED
Binary file (1.8 kB). View file
 
cold/__pycache__/dynamic_conv.cpython-312.pyc ADDED
Binary file (2.53 kB). View file
 
cold/__pycache__/predict.cpython-312.pyc ADDED
Binary file (1.42 kB). View file
 
cold/__pycache__/text_cnn.cpython-310.pyc ADDED
Binary file (1.63 kB). View file
 
cold/__pycache__/text_cnn.cpython-312.pyc ADDED
Binary file (1.97 kB). View file
 
cold/classifier.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import BertModel, BertTokenizer
4
+ from torch.optim import AdamW, lr_scheduler
5
+ from .text_cnn import DynamicTextCNN
6
+ from sklearn.metrics import classification_report, confusion_matrix
7
+ from tqdm import tqdm
8
+ import os
9
+
10
class ToxicTextClassifier(nn.Module):
    """Toxic-text classifier: BERT encoder + dynamic TextCNN + MLP head.

    The last two BERT hidden-state layers are concatenated along the
    feature dimension, summarised by a ``DynamicTextCNN``, and classified
    by a small fully-connected head.

    Args:
        bert_name: Hugging Face model id of the BERT backbone.
        num_filters: Filters per convolution width in the TextCNN.
        filter_sizes: Convolution widths used by the TextCNN.
        K: Number of expert convolutions per width (dynamic convolution).
        fc_dim: Hidden width of the classification head.
        num_classes: Number of output classes.
        dropout: Dropout probability used throughout.
        name: Run name; used for default data/output/log paths.
        path: Checkpoint path to load. Defaults to ``output/{name}.pth``.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
    """

    def __init__(self,
                 bert_name='hfl/chinese-roberta-wwm-ext',
                 num_filters=128,
                 filter_sizes=(1, 2, 3, 4),
                 K=4,
                 fc_dim=128,
                 num_classes=2,
                 dropout=0.1,
                 name='lite',
                 path=None,
                 ):
        super().__init__()
        # NOTE(review): `from_tf=True` is normally a *model*-loading flag;
        # it is unusual on a tokenizer — confirm it is intentional.
        self.tokenizer = BertTokenizer.from_pretrained(bert_name, from_tf=True)
        self.bert = BertModel.from_pretrained(bert_name)
        self.name = name
        self.unfrozen_layers = 0

        # forward() concatenates the last two hidden-state layers.
        hidden_size = self.bert.config.hidden_size * 2
        os.makedirs(f'data/{name}', exist_ok=True)

        self.text_cnn = DynamicTextCNN(hidden_size, num_filters, filter_sizes, K, dropout)
        input_dim = len(filter_sizes) * num_filters
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, fc_dim),
            nn.ReLU(),
            nn.LayerNorm(fc_dim),
            nn.Dropout(dropout),
            nn.Linear(fc_dim, fc_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(fc_dim // 2, num_classes)
        )

        self.criterion = nn.CrossEntropyLoss()
        self._rebuild_optimizer()

        # Fix: the original loaded the checkpoint twice (duplicated block)
        # and raised a misleading "You moved the default model path" error
        # for the default path; load exactly once with one clear error.
        if path is None:
            path = f'output/{name}.pth'
        if os.path.exists(path):
            self.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
            print(f"Model loaded from {path}")
        else:
            raise FileNotFoundError(f"Model file {path} not found.")

    def _rebuild_optimizer(self):
        """(Re)create the optimizer and LR scheduler.

        Called at construction and whenever ``self.unfrozen_layers``
        changes, so newly trainable BERT layers get their own parameter
        group with a smaller learning rate than the heads.

        Note: the original marked this ``@deprecated``, but it is still
        called from both ``__init__`` and ``train_model`` — kept as-is.
        """
        param_groups = [
            {'params': self.text_cnn.parameters(), 'lr': 1e-4},
            {'params': self.classifier.parameters(), 'lr': 1e-4},
        ]

        if self.unfrozen_layers > 0:
            # Unfreeze the top-most encoder layers and fine-tune them gently.
            layers = self.bert.encoder.layer[-self.unfrozen_layers:]
            bert_params = []
            for layer in layers:
                for p in layer.parameters():
                    p.requires_grad = True
                    bert_params.append(p)
            param_groups.append({'params': bert_params, 'lr': 2e-5})

        self.optimizer = AdamW(param_groups, weight_decay=0.01)

        # Halve the LR after `patience` validation checks without improvement.
        self.scheduler = lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='min',
            factor=0.5,
            patience=2,
        )

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Compute class logits for a tokenized batch.

        Returns:
            Tensor of shape (batch, num_classes).
        """
        bert_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_hidden_states=True,
        )
        # Concatenate the last two hidden-state layers feature-wise.
        hidden = torch.cat(bert_out.hidden_states[-2:], dim=-1)
        feat = self.text_cnn(hidden)
        return self.classifier(feat)

    def validate(self, val_loader, device):
        """Evaluate on `val_loader`.

        Returns:
            dict with 'loss' (mean over batches), 'acc', sklearn 'report'
            and 'confusion_matrix'.
        """
        self.eval()
        val_loss = 0
        correct = 0
        total = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            pbar = tqdm(val_loader, desc='Validating')
            for batch in pbar:
                ids = batch['input_ids'].to(device)
                mask = batch['attention_mask'].to(device)
                types = batch['token_type_ids'].to(device)
                labels = batch['label'].to(device)

                logits = self(ids, mask, types)
                loss = self.criterion(logits, labels)
                val_loss += loss.item()

                preds = torch.argmax(logits, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())

                pbar.set_postfix({'loss': f'{loss.item():.4f}'})

        epoch_acc = correct / total if total > 0 else 0
        metrics = {
            'loss': val_loss / len(val_loader),
            'acc': epoch_acc,
            'report': classification_report(all_labels, all_preds, target_names=['non-toxic','toxic']),
            'confusion_matrix': confusion_matrix(all_labels, all_preds)
        }
        # Safe no-op on CPU-only runs.
        torch.cuda.empty_cache()
        return metrics

    def train_model(self, train_loader, val_loader,
                    num_epochs=3, device='cpu',
                    save_path=None,
                    logdir=None,
                    validate_every=100,
                    early_stop_patience=3):
        """Train with periodic validation, checkpointing and early stopping.

        BERT starts fully frozen; at epoch 2 the top 4 encoder layers are
        unfrozen and the optimizer is rebuilt. Every `validate_every` steps
        the model is validated; the best (lowest-loss) state is saved to
        `save_path`, and training stops early after `early_stop_patience`
        validation checks without improvement.

        Note: despite its name, `epochs_no_improve` counts validation
        *checks*, not epochs.
        """
        self.to(device)

        # Phase 1: train only the TextCNN + head on top of frozen BERT.
        for param in self.bert.parameters():
            param.requires_grad = False

        best_val_loss = float('inf')
        global_step = 0
        epochs_no_improve = 0
        best_model_state = None

        if save_path is None:
            save_path = f'output/{self.name}.pth'

        if logdir is None:
            logdir = f'runs/{self.name}'

        for epoch in range(1, num_epochs + 1):
            print(f"\nEpoch {epoch}/{num_epochs}")

            total_loss = 0
            correct = 0
            total = 0

            # Phase 2: from epoch 2, fine-tune the top 4 BERT layers.
            if epoch == 2:
                self.unfrozen_layers = 4
                self._rebuild_optimizer()

            pbar = tqdm(train_loader, desc='Training')
            for batch in pbar:
                ids = batch['input_ids'].to(device)
                mask = batch['attention_mask'].to(device)
                types = batch['token_type_ids'].to(device)
                labels = batch['label'].to(device)

                logits = self(ids, mask, types)
                loss = self.criterion(logits, labels)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()
                preds = torch.argmax(logits, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)
                acc = correct / total

                pbar.set_postfix({'loss': f'{loss.item():.4f}', 'acc': f'{acc:.4f}'})
                global_step += 1

                if global_step % validate_every == 0:
                    torch.cuda.empty_cache()
                    self.eval()
                    with torch.no_grad():
                        metrics = self.validate(val_loader, device)
                    val_loss, val_acc = metrics['loss'], metrics['acc']

                    self.scheduler.step(val_loss)

                    if val_loss < best_val_loss:
                        best_val_loss = val_loss
                        best_model_state = self.state_dict()
                        epochs_no_improve = 0
                        torch.save(best_model_state, save_path)
                        print(f"Saved best model (step {global_step}) with loss {best_val_loss:.4f}")
                    else:
                        epochs_no_improve += 1
                        print(f"No improvement for {epochs_no_improve} checks")

                        if epochs_no_improve >= early_stop_patience:
                            print(f"Early stopping triggered at step {global_step}!")
                            # Restore the best weights before returning.
                            self.load_state_dict(best_model_state)
                            return

                    # Validation switched us to eval mode; resume training.
                    self.train()

    def predict(self, texts, device='cpu'):
        """Used for inference. Predicts the class of the input text.

        Args:
            texts (str or list): The input text(s) to classify.
                - A single string is classified as one instance.
                - A list of strings is classified as a batch.
                - A list of [text, context] pairs is tokenized as sentence
                  pairs (text first, context second).
            device (str): 'cpu', 'cuda', or 'mps'. If None, the best
                available device is auto-detected.

        Returns:
            list: One dict per input, each containing:
                - 'text': The input text.
                - 'prediction': The predicted class index.
                - 'probabilities': Softmax probabilities for each class.

        Raises:
            ValueError: If `texts` is neither str nor a supported list form.
        """
        if device is None:
            if torch.cuda.is_available():
                device = 'cuda'
            elif torch.backends.mps.is_available():
                device = 'mps'
            else:
                device = 'cpu'

        self.eval()
        self.to(device)

        if isinstance(texts, str):
            texts = [texts]
            encoded_inputs = self.tokenizer(
                texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(device)
        elif isinstance(texts, list) and all(isinstance(item, list) for item in texts):
            # Sentence-pair mode: element 0 is the detected text, element 1
            # its context.
            encoded_inputs = self.tokenizer(
                [item[0] for item in texts],
                [item[1] for item in texts],
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(device)
        elif isinstance(texts, list) and all(isinstance(item, str) for item in texts):
            encoded_inputs = self.tokenizer(
                texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(device)
        else:
            raise ValueError("Invalid input type. Expected str or list of str.")

        input_ids = encoded_inputs['input_ids']
        attention_mask = encoded_inputs['attention_mask']
        token_type_ids = encoded_inputs.get('token_type_ids', None)

        with torch.no_grad():
            logits = self(input_ids, attention_mask, token_type_ids)
            probs = torch.softmax(logits, dim=-1)
            preds = torch.argmax(probs, dim=-1)

        results = []
        for i, text in enumerate(texts):
            results.append({
                'text': text,
                'prediction': preds[i].item(),
                'probabilities': probs[i].cpu().tolist()
            })
        return results
cold/dynamic_conv.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+
4
class DynamicConv1d(nn.Module):
    """Dynamic 1-D convolution: K parallel convs mixed by learned attention.

    Runs K "expert" Conv1d layers over the input and combines their outputs
    with per-sample attention weights produced by a squeeze-and-excite-style
    gate over globally pooled features.

    Args:
        in_channels: Input feature dimension (last axis of the input).
        out_channels: Output channels of each expert convolution.
        kernel_size: Convolution width. NOTE(review): for even kernel sizes,
            padding=kernel_size//2 changes the sequence length by one;
            downstream adaptive pooling absorbs this — confirm intended.
        K: Number of expert convolutions.
        reduction: Channel reduction ratio inside the attention gate.
    """

    def __init__(self, in_channels, out_channels, kernel_size, K=4, reduction=4):
        super().__init__()
        self.K = K
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels, out_channels, kernel_size,
                      padding=kernel_size // 2)
            for _ in range(K)
        ])
        # Gate: global average pool -> bottleneck -> one logit per expert.
        self.attn = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Conv1d(in_channels, max(in_channels // reduction, 1), 1),
            nn.SiLU(),
            nn.Conv1d(max(in_channels // reduction, 1), K, 1)
        )
        # Zero-init the final gate weights so attention starts near-uniform.
        # NOTE(review): the layer's bias is still randomly initialised; zero
        # it too if exactly uniform initial attention is intended.
        nn.init.zeros_(self.attn[-1].weight)

    def forward(self, x):
        """Mix the K expert conv outputs.

        Args:
            x: Tensor of shape (batch, seq_len, in_channels).

        Returns:
            Tensor of shape (batch, out_channels, seq_len') where seq_len'
            equals seq_len for odd kernel sizes.
        """
        x = x.permute(0, 2, 1)  # (B, L, C) -> (B, C, L) for Conv1d
        attn_logits = self.attn(x)                    # (B, K, 1)
        attn_weights = F.softmax(attn_logits, dim=1)  # normalise over experts
        conv_outs = [conv(x) for conv in self.convs]

        # Weighted sum of the K expert outputs (weights broadcast over C, L).
        out = sum(w * o for w, o in zip(attn_weights.split(1, dim=1), conv_outs))
        return out
cold/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
cold/text_cnn.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from .dynamic_conv import DynamicConv1d
5
+
6
class DynamicTextCNN(nn.Module):
    """Multi-width dynamic-convolution text encoder.

    Applies one DynamicConv1d per width in `filter_sizes`, max-pools each
    branch over time, concatenates the pooled features, then applies
    LayerNorm and dropout.
    """

    def __init__(self, input_dim, num_filters, filter_sizes, K=4, dropout=0.1):
        super().__init__()
        self.convs = nn.ModuleList(
            DynamicConv1d(input_dim, num_filters, width, K)
            for width in filter_sizes
        )
        self.layer_norm = nn.LayerNorm(len(filter_sizes) * num_filters)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Encode (batch, seq, input_dim) into a fixed-size feature vector.

        Returns:
            Tensor of shape (batch, len(filter_sizes) * num_filters).
        """
        # One activated branch per convolution width.
        activated = [F.relu(branch(x)) for branch in self.convs]
        # Max-over-time pooling collapses the sequence axis of each branch.
        pooled = [F.adaptive_max_pool1d(a, 1).squeeze(-1) for a in activated]
        merged = self.layer_norm(torch.cat(pooled, dim=1))
        return self.dropout(merged)