OmidSakaki committed on
Commit
c1007cf
·
verified ·
1 Parent(s): dabdef3

Upload mental_health_text_classification.py

Browse files
Files changed (1) hide show
  1. mental_health_text_classification.py +256 -0
mental_health_text_classification.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Mental Health Text Classification.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/11fDg8hX2svH1yGzRUU8Ji4ooR_4EoVXL
8
+
9
+ # **1. Install packages**
10
+ """
11
+
12
+ !pip install -q --upgrade \
13
+ transformers==4.51.0 \
14
+ datasets==3.1.0 \
15
+ peft==0.13.2 \
16
+ accelerate==1.0.1 \
17
+ evaluate \
18
+ scikit-learn \
19
+ matplotlib seaborn wordcloud
20
+
"""# **2. Imports**"""

# Standard library
import os
import warnings

# Numeric / plotting stack
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling stack
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
from huggingface_hub import login

# Colab-only helpers (secrets store + file-upload widget)
from google.colab import userdata, files

# Silence noisy library warnings and give every figure a consistent size.
warnings.filterwarnings("ignore")
plt.rcParams['figure.figsize'] = (10, 6)
sns.set(style="whitegrid")
"""# **3. Hugging Face login**"""

# Prefer the HF_TOKEN Colab secret; fall back to an interactive prompt.
# An unauthenticated session still works but may hit Hub rate limits.
try:
    login(token=userdata.get('HF_TOKEN'))
    print("Successfully logged in using Colab secret")
except Exception as e:
    print(f"Secret login failed: {e}")
    try:
        login()
        print("Interactive login successful")
    # Fix: the original bare `except:` also swallowed KeyboardInterrupt and
    # SystemExit; catch only ordinary errors so the user can still abort.
    except Exception:
        print("Login skipped – may hit rate limits")
61
+ """# **4. Download dataset from Kaggle**"""
62
+
63
+ if not os.path.exists('/root/.kaggle/kaggle.json'):
64
+ print("Please upload kaggle.json")
65
+ uploaded = files.upload()
66
+ if 'kaggle.json' in uploaded:
67
+ !mkdir -p ~/.kaggle
68
+ !cp kaggle.json ~/.kaggle/
69
+ !chmod 600 ~/.kaggle/kaggle.json
70
+
71
+ !kaggle datasets download -d priyangshumukherjee/mental-health-text-classification-dataset --unzip -p ./data -q
72
+ print("Downloaded files:", os.listdir('./data'))
73
+
"""# **5. Load data & prepare labels**"""

# NOTE(review): the misspelling in the training filename appears to match the
# file shipped in the Kaggle dataset — confirm before "fixing" the path.
TRAIN_PATH = './data/mental_heath_unbanlanced.csv'
TEST_PATH = './data/mental_health_combined_test.csv'

df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

# Fixed 4-way label scheme, shared by the model config, metrics and plots.
label2id = {'Normal': 0, 'Depression': 1, 'Anxiety': 2, 'Suicidal': 3}
id2label = {v: k for k, v in label2id.items()}

# Map the status strings straight into the `labels` column that Trainer
# expects (the original built `label` and then renamed it).
# Fix: a status outside label2id would map to NaN and make astype(int) fail
# with a cryptic error, so validate and fail fast with a readable message.
for _df, _split in ((df_train, 'train'), (df_test, 'test')):
    _mapped = _df['status'].map(label2id)
    if _mapped.isna().any():
        _unknown = sorted(_df.loc[_mapped.isna(), 'status'].unique())
        raise ValueError(f"Unknown status values in {_split} set: {_unknown}")
    _df['labels'] = _mapped.astype(int)

# Stratified split keeps the class balance identical in train and validation.
train_df, val_df = train_test_split(
    df_train,
    test_size=0.12,
    stratify=df_train['labels'],
    random_state=42
)

# Hugging Face Datasets only need the text and the integer labels.
train_ds = Dataset.from_pandas(train_df[['text', 'labels']].reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df[['text', 'labels']].reset_index(drop=True))
test_ds = Dataset.from_pandas(df_test[['text', 'labels']].reset_index(drop=True))
"""# **6. Tokenization**"""

MODEL_NAME = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize a batch of texts; padding is deferred to the data collator."""
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=224,   # hard truncation length chosen by the author
        padding=False,    # dynamic padding happens in DataCollatorWithPadding
    )

# Drop the raw text after tokenization; the Trainer only needs the token ids,
# attention mask and labels.
tokenized_train = train_ds.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_val = val_ds.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_test = test_ds.map(tokenize_function, batched=True, remove_columns=["text"])
"""# **7. Load model**"""

device = "cuda" if torch.cuda.is_available() else "cpu"

# fp16 weights halve the memory footprint; the trainable tensors are promoted
# back to fp32 below so optimizer updates stay numerically stable.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=4,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,  # fresh 4-way classification head
    torch_dtype=torch.float16,
).to(device)

"""# **8. Apply LoRA**"""

# Low-rank adapters on the attention query/value projections; the new
# classification head is fully trained and saved alongside the adapters.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_proj", "value_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    modules_to_save=["classifier"]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Cast only the trainable parameters up to fp32 (frozen base stays fp16).
for name, param in model.named_parameters():
    if param.requires_grad:
        param.data = param.data.float()

print("Trainable parameters")
"""# **9. Metrics**"""

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a Trainer evaluation step.

    `eval_pred` is a (logits, labels) pair as supplied by transformers.Trainer.
    Weighted F1 matches `metric_for_best_model` in the TrainingArguments.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }
"""# **10. TrainingArguments**"""

training_args = TrainingArguments(
    output_dir="./mental_health_deberta_lora",
    num_train_epochs=4,
    # Effective train batch size = 8 * 2 (gradient accumulation) = 16.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=1.5e-4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    fp16=True,
    # Evaluate and checkpoint every epoch, then restore the best weighted-F1
    # checkpoint at the end of training.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
    optim="adamw_torch",
    max_grad_norm=0.5,
    lr_scheduler_type="cosine",
    dataloader_num_workers=2,
    # Keep all dataset columns so the collator sees `labels`.
    remove_unused_columns=False,
)
"""# **11. Trainer**"""

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # Fix: the `tokenizer=` kwarg is deprecated since transformers 4.46 (to be
    # removed in v5); `processing_class` is the supported name in the pinned
    # 4.51.0 release and behaves identically here.
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

"""# **12. training**"""

print("Starting training...")
trainer.train()
"""# **13. Evaluate & plot**"""

# Aggregate metrics on the held-out test set.
test_results = trainer.evaluate(tokenized_test)
print("\nTest results:", test_results)

# Per-class breakdown from the raw logits.
predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

print("\nClassification Report:\n")
print(classification_report(true_labels, preds, target_names=list(id2label.values())))

# Confusion Matrix
class_names = list(id2label.values())
cm = confusion_matrix(true_labels, preds)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix – Balanced Test Set")
plt.show()
"""# **14. Save LoRA adapter**"""

# Fold the LoRA deltas into the base weights so the published model needs no
# peft dependency at inference time.
print("Merging LoRA weights into base model...")
merged_model = model.merge_and_unload()

# Optional: Save merged model locally first (for backup)
merged_model.save_pretrained("./merged_mental_health_deberta")
tokenizer.save_pretrained("./merged_mental_health_deberta")
print("Merged model saved locally.")

"""# **15. Push merged model + tokenizer to Hugging Face Hub**"""

repo_id = "OmidSakaki/mental-health-deberta"

print(f"Pushing merged model to: https://huggingface.co/{repo_id}")

# safetensors serialization; the repo is created public if it doesn't exist.
merged_model.push_to_hub(
    repo_id=repo_id,
    commit_message="Full merged model after LoRA fine-tuning (4-class mental health classification)",
    safe_serialization=True,
    private=False
)

tokenizer.push_to_hub(
    repo_id=repo_id,
    commit_message="Tokenizer for merged mental health model"
)

print("Upload completed! Model is now live at:", f"https://huggingface.co/{repo_id}")