AlbertCAC committed on
Commit
3c27def
·
0 Parent(s):
DOCKERFILE ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.10-slim

WORKDIR /app

# Copy only the dependency list first so the pip layer is cached
# unless requirements.txt itself changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source after installing dependencies (better caching).
COPY . .

# Hugging Face Spaces expects the app to listen on port 7860.
EXPOSE 7860

# NOTE(review): the module path `lite_DETECTIVE.app` is not visible in this
# snapshot — confirm it matches the actual package layout before deploying.
CMD ["uvicorn", "lite_DETECTIVE.app:app", "--host", "0.0.0.0", "--port", "7860"]
cold/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LiteDetective - Malicious Content Detection Pipeline
3
+
4
+ Copyright (c) 2025 Albert Zhao
5
+ Author: Albert Zhao Zhaoq@kean.edu Hu Mingcheng
6
+ Created: 2025-05-11
7
+ Updated: 2025-05-11
8
+
9
+ Description:
10
+ Package containing model implementations.
11
+
12
+ License: MIT License
13
+ """
cold/__main__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from .classifier import ToxicTextClassifier
3
+
4
def getArgs():
    """Parse the command-line arguments for the detection CLI.

    Returns:
        argparse.Namespace with `path` (model checkpoint), `device`
        (cpu/mps/cuda), and `args` (one or more texts to classify).
    """
    cli = argparse.ArgumentParser(
        description="LiteDetective - Malicious Content Detection Pipeline"
    )
    cli.add_argument("--path", type=str, default="output/cold.pth",
                     required=False, help="Path to the model")
    cli.add_argument("--device", type=str, default="cpu",
                     required=False, help="Device to use (cpu, mps, or cuda)")
    cli.add_argument("args", nargs='+', help="the text to detect")
    return cli.parse_args()
12
+
13
def main():
    """CLI entry point: parse arguments, load the model, print predictions."""
    cli = getArgs()
    detector = ToxicTextClassifier(path=cli.path)
    print(detector.predict(cli.args, device=cli.device))
18
+
19
+ if __name__ == "__main__":
20
+ main()
cold/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (434 Bytes). View file
 
cold/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (443 Bytes). View file
 
cold/__pycache__/__main__.cpython-312.pyc ADDED
Binary file (1.42 kB). View file
 
cold/__pycache__/classifier.cpython-310.pyc ADDED
Binary file (8.21 kB). View file
 
cold/__pycache__/classifier.cpython-312.pyc ADDED
Binary file (14.4 kB). View file
 
cold/__pycache__/dynamic_conv.cpython-310.pyc ADDED
Binary file (1.8 kB). View file
 
cold/__pycache__/dynamic_conv.cpython-312.pyc ADDED
Binary file (2.53 kB). View file
 
cold/__pycache__/predict.cpython-312.pyc ADDED
Binary file (1.42 kB). View file
 
cold/__pycache__/text_cnn.cpython-310.pyc ADDED
Binary file (1.63 kB). View file
 
cold/__pycache__/text_cnn.cpython-312.pyc ADDED
Binary file (1.97 kB). View file
 
cold/classifier.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import BertModel, BertTokenizer
4
+ from torch.optim import AdamW, lr_scheduler
5
+ from .text_cnn import DynamicTextCNN
6
+ from sklearn.metrics import classification_report, confusion_matrix
7
+ from tqdm import tqdm
8
+ import os
9
+
10
class ToxicTextClassifier(nn.Module):
    """Toxic-text classifier: BERT encoder + dynamic TextCNN + MLP head.

    The last two BERT hidden-state layers are concatenated along the
    feature dimension, summarised by a ``DynamicTextCNN``, and classified
    by a small fully-connected head.

    Args:
        bert_name: Hugging Face model id of the BERT backbone.
        num_filters: Filters per convolution width in the TextCNN.
        filter_sizes: Convolution widths used by the TextCNN.
        K: Number of expert convolutions per width (dynamic convolution).
        fc_dim: Hidden width of the classification head.
        num_classes: Number of output classes.
        dropout: Dropout probability used throughout.
        name: Run name; used for default data/output/log paths.
        path: Checkpoint path to load. Defaults to ``output/{name}.pth``.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
    """

    def __init__(self,
                 bert_name='hfl/chinese-roberta-wwm-ext',
                 num_filters=128,
                 filter_sizes=(1, 2, 3, 4),
                 K=4,
                 fc_dim=128,
                 num_classes=2,
                 dropout=0.1,
                 name='lite',
                 path=None,
                 ):
        super().__init__()
        # NOTE(review): `from_tf=True` is normally a *model*-loading flag;
        # it is unusual on a tokenizer — confirm it is intentional.
        self.tokenizer = BertTokenizer.from_pretrained(bert_name, from_tf=True)
        self.bert = BertModel.from_pretrained(bert_name)
        self.name = name
        self.unfrozen_layers = 0

        # forward() concatenates the last two hidden-state layers.
        hidden_size = self.bert.config.hidden_size * 2
        os.makedirs(f'data/{name}', exist_ok=True)

        self.text_cnn = DynamicTextCNN(hidden_size, num_filters, filter_sizes, K, dropout)
        input_dim = len(filter_sizes) * num_filters
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, fc_dim),
            nn.ReLU(),
            nn.LayerNorm(fc_dim),
            nn.Dropout(dropout),
            nn.Linear(fc_dim, fc_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(fc_dim // 2, num_classes)
        )

        self.criterion = nn.CrossEntropyLoss()
        self._rebuild_optimizer()

        # Fix: the original loaded the checkpoint twice (duplicated block)
        # and raised a misleading "You moved the default model path" error
        # for the default path; load exactly once with one clear error.
        if path is None:
            path = f'output/{name}.pth'
        if os.path.exists(path):
            self.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
            print(f"Model loaded from {path}")
        else:
            raise FileNotFoundError(f"Model file {path} not found.")

    def _rebuild_optimizer(self):
        """(Re)create the optimizer and LR scheduler.

        Called at construction and whenever ``self.unfrozen_layers``
        changes, so newly trainable BERT layers get their own parameter
        group with a smaller learning rate than the heads.

        Note: the original marked this ``@deprecated``, but it is still
        called from both ``__init__`` and ``train_model`` — kept as-is.
        """
        param_groups = [
            {'params': self.text_cnn.parameters(), 'lr': 1e-4},
            {'params': self.classifier.parameters(), 'lr': 1e-4},
        ]

        if self.unfrozen_layers > 0:
            # Unfreeze the top-most encoder layers and fine-tune them gently.
            layers = self.bert.encoder.layer[-self.unfrozen_layers:]
            bert_params = []
            for layer in layers:
                for p in layer.parameters():
                    p.requires_grad = True
                    bert_params.append(p)
            param_groups.append({'params': bert_params, 'lr': 2e-5})

        self.optimizer = AdamW(param_groups, weight_decay=0.01)

        # Halve the LR after `patience` validation checks without improvement.
        self.scheduler = lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='min',
            factor=0.5,
            patience=2,
        )

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Compute class logits for a tokenized batch.

        Returns:
            Tensor of shape (batch, num_classes).
        """
        bert_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_hidden_states=True,
        )
        # Concatenate the last two hidden-state layers feature-wise.
        hidden = torch.cat(bert_out.hidden_states[-2:], dim=-1)
        feat = self.text_cnn(hidden)
        return self.classifier(feat)

    def validate(self, val_loader, device):
        """Evaluate on `val_loader`.

        Returns:
            dict with 'loss' (mean over batches), 'acc', sklearn 'report'
            and 'confusion_matrix'.
        """
        self.eval()
        val_loss = 0
        correct = 0
        total = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            pbar = tqdm(val_loader, desc='Validating')
            for batch in pbar:
                ids = batch['input_ids'].to(device)
                mask = batch['attention_mask'].to(device)
                types = batch['token_type_ids'].to(device)
                labels = batch['label'].to(device)

                logits = self(ids, mask, types)
                loss = self.criterion(logits, labels)
                val_loss += loss.item()

                preds = torch.argmax(logits, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())

                pbar.set_postfix({'loss': f'{loss.item():.4f}'})

        epoch_acc = correct / total if total > 0 else 0
        metrics = {
            'loss': val_loss / len(val_loader),
            'acc': epoch_acc,
            'report': classification_report(all_labels, all_preds, target_names=['non-toxic','toxic']),
            'confusion_matrix': confusion_matrix(all_labels, all_preds)
        }
        # Safe no-op on CPU-only runs.
        torch.cuda.empty_cache()
        return metrics

    def train_model(self, train_loader, val_loader,
                    num_epochs=3, device='cpu',
                    save_path=None,
                    logdir=None,
                    validate_every=100,
                    early_stop_patience=3):
        """Train with periodic validation, checkpointing and early stopping.

        BERT starts fully frozen; at epoch 2 the top 4 encoder layers are
        unfrozen and the optimizer is rebuilt. Every `validate_every` steps
        the model is validated; the best (lowest-loss) state is saved to
        `save_path`, and training stops early after `early_stop_patience`
        validation checks without improvement.

        Note: despite its name, `epochs_no_improve` counts validation
        *checks*, not epochs.
        """
        self.to(device)

        # Phase 1: train only the TextCNN + head on top of frozen BERT.
        for param in self.bert.parameters():
            param.requires_grad = False

        best_val_loss = float('inf')
        global_step = 0
        epochs_no_improve = 0
        best_model_state = None

        if save_path is None:
            save_path = f'output/{self.name}.pth'

        if logdir is None:
            logdir = f'runs/{self.name}'

        for epoch in range(1, num_epochs + 1):
            print(f"\nEpoch {epoch}/{num_epochs}")

            total_loss = 0
            correct = 0
            total = 0

            # Phase 2: from epoch 2, fine-tune the top 4 BERT layers.
            if epoch == 2:
                self.unfrozen_layers = 4
                self._rebuild_optimizer()

            pbar = tqdm(train_loader, desc='Training')
            for batch in pbar:
                ids = batch['input_ids'].to(device)
                mask = batch['attention_mask'].to(device)
                types = batch['token_type_ids'].to(device)
                labels = batch['label'].to(device)

                logits = self(ids, mask, types)
                loss = self.criterion(logits, labels)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()
                preds = torch.argmax(logits, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)
                acc = correct / total

                pbar.set_postfix({'loss': f'{loss.item():.4f}', 'acc': f'{acc:.4f}'})
                global_step += 1

                if global_step % validate_every == 0:
                    torch.cuda.empty_cache()
                    self.eval()
                    with torch.no_grad():
                        metrics = self.validate(val_loader, device)
                    val_loss, val_acc = metrics['loss'], metrics['acc']

                    self.scheduler.step(val_loss)

                    if val_loss < best_val_loss:
                        best_val_loss = val_loss
                        best_model_state = self.state_dict()
                        epochs_no_improve = 0
                        torch.save(best_model_state, save_path)
                        print(f"Saved best model (step {global_step}) with loss {best_val_loss:.4f}")
                    else:
                        epochs_no_improve += 1
                        print(f"No improvement for {epochs_no_improve} checks")

                        if epochs_no_improve >= early_stop_patience:
                            print(f"Early stopping triggered at step {global_step}!")
                            # Restore the best weights before returning.
                            self.load_state_dict(best_model_state)
                            return

                    # Validation switched us to eval mode; resume training.
                    self.train()

    def predict(self, texts, device='cpu'):
        """Used for inference. Predicts the class of the input text.

        Args:
            texts (str or list): The input text(s) to classify.
                - A single string is classified as one instance.
                - A list of strings is classified as a batch.
                - A list of [text, context] pairs is tokenized as sentence
                  pairs (text first, context second).
            device (str): 'cpu', 'cuda', or 'mps'. If None, the best
                available device is auto-detected.

        Returns:
            list: One dict per input, each containing:
                - 'text': The input text.
                - 'prediction': The predicted class index.
                - 'probabilities': Softmax probabilities for each class.

        Raises:
            ValueError: If `texts` is neither str nor a supported list form.
        """
        if device is None:
            if torch.cuda.is_available():
                device = 'cuda'
            elif torch.backends.mps.is_available():
                device = 'mps'
            else:
                device = 'cpu'

        self.eval()
        self.to(device)

        if isinstance(texts, str):
            texts = [texts]
            encoded_inputs = self.tokenizer(
                texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(device)
        elif isinstance(texts, list) and all(isinstance(item, list) for item in texts):
            # Sentence-pair mode: element 0 is the detected text, element 1
            # its context.
            encoded_inputs = self.tokenizer(
                [item[0] for item in texts],
                [item[1] for item in texts],
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(device)
        elif isinstance(texts, list) and all(isinstance(item, str) for item in texts):
            encoded_inputs = self.tokenizer(
                texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(device)
        else:
            raise ValueError("Invalid input type. Expected str or list of str.")

        input_ids = encoded_inputs['input_ids']
        attention_mask = encoded_inputs['attention_mask']
        token_type_ids = encoded_inputs.get('token_type_ids', None)

        with torch.no_grad():
            logits = self(input_ids, attention_mask, token_type_ids)
            probs = torch.softmax(logits, dim=-1)
            preds = torch.argmax(probs, dim=-1)

        results = []
        for i, text in enumerate(texts):
            results.append({
                'text': text,
                'prediction': preds[i].item(),
                'probabilities': probs[i].cpu().tolist()
            })
        return results
cold/dynamic_conv.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+
4
class DynamicConv1d(nn.Module):
    """Dynamic 1-D convolution: K parallel convs mixed by learned attention.

    Runs K "expert" Conv1d layers over the input and combines their outputs
    with per-sample attention weights produced by a squeeze-and-excite-style
    gate over globally pooled features.

    Args:
        in_channels: Input feature dimension (last axis of the input).
        out_channels: Output channels of each expert convolution.
        kernel_size: Convolution width. NOTE(review): for even kernel sizes,
            padding=kernel_size//2 changes the sequence length by one;
            downstream adaptive pooling absorbs this — confirm intended.
        K: Number of expert convolutions.
        reduction: Channel reduction ratio inside the attention gate.
    """

    def __init__(self, in_channels, out_channels, kernel_size, K=4, reduction=4):
        super().__init__()
        self.K = K
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels, out_channels, kernel_size,
                      padding=kernel_size // 2)
            for _ in range(K)
        ])
        # Gate: global average pool -> bottleneck -> one logit per expert.
        self.attn = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Conv1d(in_channels, max(in_channels // reduction, 1), 1),
            nn.SiLU(),
            nn.Conv1d(max(in_channels // reduction, 1), K, 1)
        )
        # Zero-init the final gate weights so attention starts near-uniform.
        # NOTE(review): the layer's bias is still randomly initialised; zero
        # it too if exactly uniform initial attention is intended.
        nn.init.zeros_(self.attn[-1].weight)

    def forward(self, x):
        """Mix the K expert conv outputs.

        Args:
            x: Tensor of shape (batch, seq_len, in_channels).

        Returns:
            Tensor of shape (batch, out_channels, seq_len') where seq_len'
            equals seq_len for odd kernel sizes.
        """
        x = x.permute(0, 2, 1)  # (B, L, C) -> (B, C, L) for Conv1d
        attn_logits = self.attn(x)                    # (B, K, 1)
        attn_weights = F.softmax(attn_logits, dim=1)  # normalise over experts
        conv_outs = [conv(x) for conv in self.convs]

        # Weighted sum of the K expert outputs (weights broadcast over C, L).
        out = sum(w * o for w, o in zip(attn_weights.split(1, dim=1), conv_outs))
        return out
cold/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
cold/text_cnn.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from .dynamic_conv import DynamicConv1d
5
+
6
class DynamicTextCNN(nn.Module):
    """Multi-width dynamic-convolution text encoder.

    Applies one DynamicConv1d per width in `filter_sizes`, max-pools each
    branch over time, concatenates the pooled features, then applies
    LayerNorm and dropout.
    """

    def __init__(self, input_dim, num_filters, filter_sizes, K=4, dropout=0.1):
        super().__init__()
        self.convs = nn.ModuleList(
            DynamicConv1d(input_dim, num_filters, width, K)
            for width in filter_sizes
        )
        self.layer_norm = nn.LayerNorm(len(filter_sizes) * num_filters)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Encode (batch, seq, input_dim) into a fixed-size feature vector.

        Returns:
            Tensor of shape (batch, len(filter_sizes) * num_filters).
        """
        # One activated branch per convolution width.
        activated = [F.relu(branch(x)) for branch in self.convs]
        # Max-over-time pooling collapses the sequence axis of each branch.
        pooled = [F.adaptive_max_pool1d(a, 1).squeeze(-1) for a in activated]
        merged = self.layer_norm(torch.cat(pooled, dim=1))
        return self.dropout(merged)