Jet-12138 committed on
Commit
018d244
·
verified ·
1 Parent(s): 5a7af5f

Upload 7 files

Browse files
Files changed (7) hide show
  1. README.md +103 -10
  2. __init__.py +1 -0
  3. app.py +59 -15
  4. config.json +15 -0
  5. model.py +66 -0
  6. requirements.txt +2 -2
  7. tokenizer_config.json +6 -0
README.md CHANGED
@@ -1,13 +1,106 @@
1
  ---
2
- title: CommentResponse
3
- emoji: 🔥
4
- colorFrom: gray
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.26.0
8
- app_file: app.py
9
- pinned: false
10
- license: other
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language: en
3
+ license: mit
4
+ datasets:
5
+ - toxic_comment_classification
6
+ tags:
7
+ - text-classification
8
+ - toxicity-detection
9
+ - sentiment-analysis
10
+ - multi-task-learning
11
+ pipeline_tag: text-classification
12
  ---
13
 
14
+ # Comment MTL BERT Model
15
+
16
+ This is a BERT-based multi-task learning model capable of performing sentiment analysis and toxicity detection simultaneously.
17
+
18
+ ## Model Architecture
19
+
20
+ The model is based on the `bert-base-uncased` pre-trained model with two separate classification heads:
21
+ - **Sentiment Analysis Head**: 3-class classification (Negative, Neutral, Positive)
22
+ - **Toxicity Detection Head**: 6-class multi-label classification (toxic, severe_toxic, obscene, threat, insult, identity_hate)
23
+
24
+ ### Technical Parameters
25
+
26
+ - Hidden size: 768
27
+ - Number of attention heads: 12
28
+ - Number of hidden layers: 12
29
+ - Vocabulary size: 30522
30
+ - Maximum position embeddings: 512
31
+ - Hidden activation function: gelu
32
+ - Dropout probability: 0.1
33
+
34
+ ## Usage
35
+
36
+ ### Loading the Model
37
+
38
+ ```python
39
+ from transformers import AutoTokenizer
40
+ from src.model import CommentMTLModel
41
+ import torch
42
+
43
+ # Load tokenizer
44
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
45
+
46
+ # Load model
47
+ model = CommentMTLModel(
48
+ model_name="bert-base-uncased",
49
+ num_sentiment_labels=3,
50
+ num_toxicity_labels=6
51
+ )
52
+
53
+ # Load pre-trained weights
54
+ state_dict = torch.load("model.bin", map_location=torch.device('cpu'))
55
+ model.load_state_dict(state_dict)
56
+ model.eval()
57
+ ```
58
+
59
+ ### Model Inference
60
+
61
+ ```python
62
+ # Prepare input
63
+ text = "This is a test comment."
64
+ inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
65
+
66
+ # Model inference
67
+ with torch.no_grad():
68
+ outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
69
+
70
+ # Get results
71
+ sentiment_logits = outputs["sentiment_logits"]
72
+ toxicity_logits = outputs["toxicity_logits"]
73
+
74
+ # Process sentiment analysis results
75
+ sentiment_probs = torch.softmax(sentiment_logits, dim=1)
76
+ sentiment_labels = {0: "Negative", 1: "Neutral", 2: "Positive"}
77
+ sentiment_prediction = sentiment_labels[sentiment_probs.argmax().item()]
78
+
79
+ # Process toxicity detection results
80
+ toxicity_probs = torch.sigmoid(toxicity_logits)
81
+ toxicity_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
82
+ toxicity_results = {label: prob.item() for label, prob in zip(toxicity_cols, toxicity_probs[0])}
83
+
84
+ print(f"Sentiment: {sentiment_prediction}")
85
+ print(f"Toxicity probabilities: {toxicity_results}")
86
+ ```
87
+
88
+ ## Limitations
89
+
90
+ - This model was trained on English data only and is not suitable for other languages.
91
+ - The toxicity detection may produce false positives or negatives in edge cases.
92
+ - The model may lose information when processing long texts as the maximum input length is limited to 128 tokens.
93
+
94
+ ## Citation
95
+
96
+ If you use this model, please cite our repository:
97
+
98
+ ```
99
+ @misc{comment-mtl-bert,
100
+ author = {Aseem},
101
+ title = {Comment MTL BERT: Multi-Task Learning for Comment Analysis},
102
+ year = {2023},
103
+ publisher = {GitHub},
104
+ url = {https://huggingface.co/Aseemks07/comment_mtl_bert_best}
105
+ }
106
+ ```
__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # src package
app.py CHANGED
@@ -1,20 +1,64 @@
 
 
 
1
  import gradio as gr
2
- from transformers import pipeline
3
 
4
- # 加载你的模型
5
- classifier = pipeline("text-classification", model="你的用户名/你的模型名")
6
 
7
- # 定义推理函数
8
- def classify(text):
9
- outputs = classifier(text)
10
- # 可以只返回预测标签
11
- return outputs[0]["label"]
 
 
12
 
13
- # 创建Gradio界面
14
- iface = gr.Interface(fn=classify,
15
- inputs=gr.Textbox(lines=2, placeholder="输入文本..."),
16
- outputs="text",
17
- title="文本分类模型",
18
- description="请输入一段文本,我来帮你分类!")
19
 
20
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn.functional as F
from transformers import BertTokenizer
import gradio as gr

from model import CommentMTLModel  # custom multi-task model defined in model.py

# Select the best available device: Apple MPS, then CUDA, then CPU fallback.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer must match the pre-trained backbone the model was built on.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Build the model with the same head sizes it was trained with, then load weights.
# NOTE: CommentMTLModel takes (model_name, num_sentiment_labels,
# num_toxicity_labels) — it has no config_path parameter.
model = CommentMTLModel(
    model_name="bert-base-uncased",
    num_sentiment_labels=3,
    num_toxicity_labels=6,
)
model.load_state_dict(torch.load("pytorch_model.bin", map_location=device))
model.to(device)
model.eval()

# Human-readable labels for each classification head.
sentiment_labels = ["Negative", "Neutral", "Positive"]
toxicity_labels = ["Toxic", "Severe Toxic", "Obscene", "Threat", "Insult", "Identity Hate"]


def analyse_comment(comment):
    """Classify a comment; return (sentiment label, most probable toxicity type)."""
    # max_length=128 matches the documented training/input limit of the model.
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        # Pass only the arguments forward() accepts — the tokenizer also emits
        # token_type_ids, which the model's forward signature does not take.
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

    # The model returns a dict of logits, not a tuple.
    sentiment_logits = outputs["sentiment_logits"]
    toxicity_logits = outputs["toxicity_logits"]

    # Sentiment is single-label: softmax over classes, then argmax.
    sentiment_probs = F.softmax(sentiment_logits, dim=1)
    sentiment_prediction = sentiment_labels[torch.argmax(sentiment_probs, dim=1).item()]

    # The toxicity head is multi-label, so the right activation is a per-class
    # sigmoid (not softmax); we report the single most probable toxicity type.
    toxicity_probs = torch.sigmoid(toxicity_logits)
    toxicity_prediction = toxicity_labels[torch.argmax(toxicity_probs, dim=1).item()]

    # The Interface declares two output components, so return a tuple (one value
    # per component), not a dict.
    return sentiment_prediction, toxicity_prediction


# Gradio UI: one textbox in, two labels out.
iface = gr.Interface(
    fn=analyse_comment,
    inputs=gr.Textbox(lines=3, placeholder="Please enter a comment for analysis..."),
    outputs=[
        gr.Label(num_top_classes=1, label="Predicted Sentiment"),
        gr.Label(num_top_classes=1, label="Predicted Toxicity"),
    ],
    title="Comment Sentiment and Toxicity Classifier",
    description="This tool classifies the sentiment and the most probable type of toxicity in a given comment. It utilises a custom fine-tuned BERT model. Developed for academic demonstration purposes in Australia."
)

iface.launch()
config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "comment_mtl_bert",
3
+ "pretrained_model_name": "bert-base-uncased",
4
+ "hidden_size": 768,
5
+ "num_attention_heads": 12,
6
+ "num_hidden_layers": 12,
7
+ "vocab_size": 30522,
8
+ "max_position_embeddings": 512,
9
+ "hidden_act": "gelu",
10
+ "initializer_range": 0.02,
11
+ "layer_norm_eps": 1e-12,
12
+ "dropout_prob": 0.1,
13
+ "num_sentiment_labels": 3,
14
+ "num_toxicity_labels": 6
15
+ }
model.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn as nn
from transformers import AutoModel  # BertModel import removed: it was never used


class CommentMTLModel(nn.Module):
    """
    Multi-Task Learning model using a BERT base and separate heads for
    sentiment classification and toxicity multi-label classification.
    """

    def __init__(self, model_name, num_sentiment_labels, num_toxicity_labels, dropout_prob=0.1):
        """
        Args:
            model_name (str): Name of the pre-trained BERT model from Hugging Face.
            num_sentiment_labels (int): Number of classes for sentiment analysis.
            num_toxicity_labels (int): Number of classes for toxicity detection.
            dropout_prob (float): Dropout probability for the classification heads.
        """
        super().__init__()

        # Load the pre-trained transformer backbone.
        self.bert = AutoModel.from_pretrained(model_name)

        # Shared dropout applied to the pooled output before both heads.
        self.dropout = nn.Dropout(dropout_prob)

        hidden_size = self.bert.config.hidden_size
        # Sentiment head: maps the [CLS] pooled output to sentiment logits
        # (single-label classification).
        self.sentiment_classifier = nn.Linear(hidden_size, num_sentiment_labels)
        # Toxicity head: maps the pooled output to one logit per toxicity type
        # (multi-label classification).
        self.toxicity_classifier = nn.Linear(hidden_size, num_toxicity_labels)

    def forward(self, input_ids, attention_mask):
        """
        Forward pass of the model.

        Args:
            input_ids (torch.Tensor): Input token IDs, shape (batch_size, seq_length).
            attention_mask (torch.Tensor): Attention mask, shape (batch_size, seq_length).

        Returns:
            dict: Raw output logits for each task:
                'sentiment_logits': (batch_size, num_sentiment_labels).
                'toxicity_logits': (batch_size, num_toxicity_labels).
        """
        # Run the backbone; pooler_output is the transformed [CLS] representation.
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Regularize the pooled representation before the task heads.
        pooled_output = self.dropout(outputs.pooler_output)

        return {
            'sentiment_logits': self.sentiment_classifier(pooled_output),
            'toxicity_logits': self.toxicity_classifier(pooled_output)
        }
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
- transformers
2
- torch
3
  gradio
 
 
 
 
 
1
  gradio
2
+ torch>=1.10.0
3
+ transformers>=4.18.0
tokenizer_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "bert",
3
+ "do_lower_case": true,
4
+ "tokenizer_class": "BertTokenizer",
5
+ "name_or_path": "bert-base-uncased"
6
+ }