CoEdd committed on
Commit
e834ba4
·
1 Parent(s): aa57927

Track src/train.csv with Git LFS

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ src/train.csv filter=lfs diff=lfs merge=lfs -text
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ matplotlib
4
+ seaborn
5
+ scikit-learn
6
+ torch
7
+ transformers
8
+ datasets
9
+ gradio
10
+ ftfy
11
+ accelerate>=0.26.0
12
+ flask
src/__pycache__/model.cpython-312.pyc ADDED
Binary file (7.67 kB). View file
 
src/app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ import gradio as gr
3
+ from model import ToxicCommentDetector
4
+
5
# Module-level singletons: the Flask app and one shared detector instance.
# load_models() downloads/loads all three backbones at import time, so the
# first import of this module is slow and requires network/disk access.
app = Flask(__name__)
detector = ToxicCommentDetector()
detector.load_models()
8
+
9
@app.route('/predict', methods=['POST'])
def predict():
    """Score a comment for toxicity via the shared detector.

    Expects a JSON body: {"text": str, "model_name": str (optional,
    defaults to "DistilBERT")}. Returns the per-label probability dict
    as JSON, or an {"error": ...} payload with status 400/500.
    """
    # request.json is None (or raises, depending on Flask version) when the
    # body is not valid JSON; get_json(silent=True) gives us None instead,
    # which we normalize to an empty dict so .get() below is always safe.
    data = request.get_json(silent=True) or {}
    text = data.get('text', '')
    model_name = data.get('model_name', 'DistilBERT')

    if not text:
        return jsonify({"error": "No text provided"}), 400

    try:
        results = detector.predict(text, model_name)
        return jsonify(results)
    except Exception as e:
        # e.g. an unknown model_name raises ValueError in detector.predict.
        return jsonify({"error": str(e)}), 500
23
+
24
def create_gradio_interface(detector):
    """Build the Gradio UI around a loaded ToxicCommentDetector.

    Args:
        detector: a ToxicCommentDetector whose .models dict is already
            populated (load_models() has run) — its keys feed the model
            dropdown and its .predict() backs the analyze button.

    Returns:
        A gr.Blocks interface; call .launch() on it to serve the app.
    """
    def predict_toxicity(text, model_name):
        # Callback wired to the "Analyze Toxicity" button.
        if not text.strip():
            return "Please enter some text to analyze."
        try:
            results = detector.predict(text, model_name)
            output = f"🔍 **Analysis Results using {model_name}:**\n\n"
            for label, score in results.items():
                # 0.5 is the display threshold for flagging a label as toxic.
                emoji = "🚨" if score > 0.5 else "✅"
                output += f"{emoji} **{label.replace('_', ' ').title()}**: {score:.3f} ({score*100:.1f}%)\n"
            return output
        except Exception as e:
            return f"Error: {str(e)}"

    with gr.Blocks(title="🛡️ Toxic Comment Detector", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🛡️ Toxic Comment Detector
        This app uses three different pre-trained models to detect toxicity in comments.
        Enter your text below and choose a model to get predictions, or compare all models at once!
        """)

        with gr.Tab("Single Model Prediction"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(label="Enter comment to analyze", placeholder="Type your comment here...", lines=3)
                    # Default to the first registered model name.
                    model_dropdown = gr.Dropdown(choices=list(detector.models.keys()), label="Select Model", value=list(detector.models.keys())[0])
                    predict_btn = gr.Button("🔍 Analyze Toxicity", variant="primary")

                with gr.Column():
                    single_output = gr.Markdown(label="Results")

            predict_btn.click(predict_toxicity, inputs=[text_input, model_dropdown], outputs=single_output)

    return interface
58
+
59
# Script entry point: serve the Gradio UI (the Flask /predict route is only
# served if app.run() is invoked separately).
if __name__ == "__main__":
    interface = create_gradio_interface(detector)
    interface.launch()
src/model.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
3
+ import torch
4
+
5
class ToxicCommentDetector:
    """Multi-label toxic-comment classifier over three transformer backbones.

    Maintains one model + tokenizer per configured backbone (DistilBERT,
    RoBERTa, ALBERT) and predicts six Jigsaw toxicity labels per comment.
    """

    def __init__(self):
        # model_name -> model / tokenizer; filled by load_models() or train_model().
        self.models = {}
        self.tokenizers = {}
        # Output-logit order of the six Jigsaw toxicity labels.
        self.label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

        # Per-backbone hyperparameters: HF hub id, tokenization length,
        # and training knobs consumed by train_model().
        self.model_configs = {
            'DistilBERT': {
                'name': 'distilbert-base-uncased',
                'max_len': 128,
                'batch_size': 16,
                'epochs': 3,
                'lr': 2e-5
            },
            'RoBERTa': {
                'name': 'roberta-base',
                'max_len': 128,
                'batch_size': 8,
                'epochs': 3,
                'lr': 1e-5
            },
            'ALBERT': {
                'name': 'albert-base-v2',
                'max_len': 128,
                'batch_size': 16,
                'epochs': 3,
                'lr': 3e-5
            }
        }

    def load_models(self):
        """Load pre-trained (not fine-tuned) models and tokenizers for every backbone."""
        for model_name, config in self.model_configs.items():
            print(f"Loading {model_name}...")
            self.models[model_name] = AutoModelForSequenceClassification.from_pretrained(config['name'], num_labels=len(self.label_columns))
            self.tokenizers[model_name] = AutoTokenizer.from_pretrained(config['name'])
        print("✅ Models and tokenizers loaded successfully!")

    def load_and_preprocess_data(self, file_path):
        """Load a CSV dataset from *file_path* and clean its 'comment_text' column."""
        print(f"📊 Loading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        print(f"✅ Dataset loaded successfully! First few rows:\n{df.head()}")

        # Local import keeps this module importable without the sibling module.
        from preprocess import preprocess_data
        df = preprocess_data(df)
        print("✅ Data preprocessing completed!")
        return df

    def train_model(self, model_name, X_train, X_val, y_train, y_val):
        """Fine-tune one configured backbone and store it in self.models.

        Args:
            model_name: key into self.model_configs ('DistilBERT' | 'RoBERTa' | 'ALBERT').
            X_train, X_val: training / validation texts.
            y_train, y_val: multi-hot label arrays aligned with self.label_columns.

        Returns:
            The trainer's evaluation-metrics dict (includes 'eval_auc', 'eval_f1').
        """
        # These names were referenced but never imported anywhere in this
        # file, which made this method fail with NameError; import them here
        # so the module stays light to import when only predict() is used.
        from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
        from utils import compute_metrics

        print(f"\n🚀 Training {model_name}...")

        config = self.model_configs[model_name]

        tokenizer = AutoTokenizer.from_pretrained(config['name'])
        model = AutoModelForSequenceClassification.from_pretrained(
            config['name'],
            num_labels=len(self.label_columns),
            problem_type="multi_label_classification"
        )

        # NOTE(review): ToxicDataset is not defined in this repository
        # snapshot — confirm which module provides it and import it here.
        train_dataset = ToxicDataset(X_train, y_train, tokenizer, config['max_len'], model_name)
        val_dataset = ToxicDataset(X_val, y_val, tokenizer, config['max_len'], model_name)

        training_args = TrainingArguments(
            output_dir=f'./results_{model_name.lower()}',
            num_train_epochs=config['epochs'],
            per_device_train_batch_size=config['batch_size'],
            per_device_eval_batch_size=config['batch_size'],
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir=f'./logs_{model_name.lower()}',
            logging_steps=100,
            eval_strategy="steps",
            eval_steps=500,
            save_strategy="steps",
            save_steps=500,
            # "auc" refers to the 'auc' key produced by compute_metrics
            # (the Trainer prefixes it with 'eval_').
            load_best_model_at_end=True,
            metric_for_best_model="auc",
            greater_is_better=True,
            learning_rate=config['lr'],
            adam_epsilon=1e-8,
            max_grad_norm=1.0,
            # Mixed precision only when a GPU is present.
            fp16=True if torch.cuda.is_available() else False,
            dataloader_num_workers=0,
            save_total_limit=1,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        trainer.train()

        # Register the fine-tuned artifacts for predict()/evaluate_all_models().
        self.models[model_name] = model
        self.tokenizers[model_name] = tokenizer

        eval_results = trainer.evaluate()
        print(f"✅ {model_name} - Validation AUC: {eval_results['eval_auc']:.4f}, F1: {eval_results['eval_f1']:.4f}")

        return eval_results

    def predict(self, text, model_name):
        """Return {label: probability} for *text* using the named model.

        Raises:
            ValueError: if *model_name* has not been loaded/trained yet.
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not trained yet!")

        model = self.models[model_name]
        tokenizer = self.tokenizers[model_name]

        # Run inference on whatever device the model already lives on.
        device = next(model.parameters()).device

        tokenizer_kwargs = {
            'text': text,
            'add_special_tokens': True,
            'max_length': 128,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt'
        }

        # DistilBERT has no token-type embeddings; the other backbones
        # receive token_type_ids as well.
        if 'distilbert' not in model_name.lower():
            tokenizer_kwargs['return_token_type_ids'] = True

        inputs = tokenizer.encode_plus(**tokenizer_kwargs)

        for key in inputs:
            inputs[key] = inputs[key].to(device)

        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)
            # Sigmoid per logit: the labels are independent (multi-label).
            predictions = torch.sigmoid(outputs.logits).cpu().numpy()[0]

        results = {}
        for i, label in enumerate(self.label_columns):
            results[label] = float(predictions[i])

        return results

    def evaluate_all_models(self, X_test, y_test):
        """Evaluate every registered model on the test split.

        Returns:
            {model_name: {'auc': float, 'f1': float}}
        """
        # Imported locally for the same reason as in train_model (see above).
        from transformers import Trainer
        from utils import compute_metrics

        results = {}

        for model_name in self.models.keys():
            print(f"\n🔍 Evaluating {model_name} on test set...")

            model = self.models[model_name]
            tokenizer = self.tokenizers[model_name]

            # NOTE(review): ToxicDataset is undefined in this snapshot — see train_model.
            test_dataset = ToxicDataset(X_test, y_test, tokenizer, 128, model_name)

            trainer = Trainer(
                model=model,
                compute_metrics=compute_metrics,
            )

            eval_results = trainer.evaluate(test_dataset)
            results[model_name] = {
                'auc': eval_results['eval_auc'],
                'f1': eval_results['eval_f1']
            }

            print(f"📊 {model_name} - Test AUC: {eval_results['eval_auc']:.4f}, F1: {eval_results['eval_f1']:.4f}")

        return results
src/preprocess.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def clean_text(text):
    """Normalize a raw comment: collapse whitespace and repair mojibake.

    Non-string inputs (e.g. NaN/None produced by pd.read_csv for empty
    cells) previously crashed re.sub with a TypeError; they are treated
    as empty comments instead.
    """
    import re
    import ftfy

    # Guard: missing values from pandas arrive as float NaN or None.
    if not isinstance(text, str):
        return ''

    # Replace newlines, tabs, carriage returns with space
    text = re.sub(r'[\n\r\t]', ' ', text)
    # Strip leading and trailing whitespace
    text = text.strip()
    # Remove excessive spaces
    text = re.sub(r'\s+', ' ', text)
    # Fix encoding artifacts (mojibake) with ftfy
    text = ftfy.fix_text(text)

    return text
15
+
16
def preprocess_data(df):
    """Clean the 'comment_text' column of *df* in place and return it."""
    cleaned = df['comment_text'].map(clean_text)
    df['comment_text'] = cleaned
    return df
src/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd4084611bd27c939ba98e5e63bc3e5a2c1a4e99477dcba46c829e4c986c429d
3
+ size 68802655
src/utils.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def compute_metrics(eval_pred):
    """Compute macro-averaged ROC-AUC and F1 for a multi-label eval batch.

    Args:
        eval_pred: (logits, labels) pair as handed over by a
            transformers Trainer; logits are raw (pre-sigmoid) scores of
            shape (batch, n_labels), labels are the multi-hot targets.

    Returns:
        {'auc': float, 'f1': float} macro-averaged over the label columns
        that contain both classes (0.0 when no column qualifies).
    """
    # This module never imported these names, so the function failed with
    # NameError on first call; import them locally.
    import numpy as np
    import torch
    from sklearn.metrics import roc_auc_score, f1_score

    predictions, labels = eval_pred
    predictions = torch.sigmoid(torch.tensor(predictions)).numpy()

    # Convert probabilities to binary predictions at the 0.5 threshold.
    binary_predictions = (predictions > 0.5).astype(int)

    auc_scores = []
    f1_scores = []

    for i in range(labels.shape[1]):
        # AUC is undefined when a column has only one class; skip it.
        if len(np.unique(labels[:, i])) > 1:
            auc = roc_auc_score(labels[:, i], predictions[:, i])
            auc_scores.append(auc)
            f1 = f1_score(labels[:, i], binary_predictions[:, i])
            f1_scores.append(f1)

    # Guard against np.mean([]) which returns nan with a RuntimeWarning.
    return {
        'auc': float(np.mean(auc_scores)) if auc_scores else 0.0,
        'f1': float(np.mean(f1_scores)) if f1_scores else 0.0
    }