# Mental_Health_Text_Classification / mental_health_text_classification.py
# OmidSakaki's picture
# Upload mental_health_text_classification.py
# c1007cf verified
# -*- coding: utf-8 -*-
"""Mental Health Text Classification.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/11fDg8hX2svH1yGzRUU8Ji4ooR_4EoVXL
# **1. Install packages**
"""
# Pin the core training stack (transformers/datasets/peft/accelerate) so the
# notebook is reproducible; evaluate, scikit-learn and the plotting libraries
# are left unpinned (latest compatible versions).
!pip install -q --upgrade \
transformers==4.51.0 \
datasets==3.1.0 \
peft==0.13.2 \
accelerate==1.0.1 \
evaluate \
scikit-learn \
matplotlib seaborn wordcloud
"""# **2. Imports**"""
import os
import warnings
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
Trainer,
TrainingArguments,
DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
from huggingface_hub import login
from google.colab import userdata, files
# Global presentation defaults: silence library warnings, 10x6-inch figures,
# seaborn white-grid theme for all plots below.
warnings.filterwarnings("ignore")
plt.rcParams['figure.figsize'] = (10, 6)
sns.set(style="whitegrid")
"""# **3. Hugging Face login**"""
# Authenticate with the Hugging Face Hub.
# Preferred path: the Colab secret 'HF_TOKEN'; fall back to an interactive
# login prompt; if that also fails, continue anonymously (Hub downloads may
# then hit rate limits).  The original used a bare `except:` on the inner
# fallback, which would also swallow KeyboardInterrupt/SystemExit — narrowed
# to `except Exception`.
try:
    login(token=userdata.get('HF_TOKEN'))
    print("Successfully logged in using Colab secret")
except Exception as e:
    print(f"Secret login failed: {e}")
    try:
        login()
        print("Interactive login successful")
    except Exception:
        print("Login skipped – may hit rate limits")
"""# **4. Download dataset from Kaggle**"""
if not os.path.exists('/root/.kaggle/kaggle.json'):
print("Please upload kaggle.json")
uploaded = files.upload()
if 'kaggle.json' in uploaded:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d priyangshumukherjee/mental-health-text-classification-dataset --unzip -p ./data -q
print("Downloaded files:", os.listdir('./data'))
"""# **5. Load data & prepare labels**"""
# CSV files produced by the Kaggle download step.
# NOTE: the misspellings in the train filename come from the dataset itself —
# do not "fix" them.
TRAIN_PATH = './data/mental_heath_unbanlanced.csv'
TEST_PATH = './data/mental_health_combined_test.csv'

df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

# Fixed 4-class label space; id2label is the inverse lookup for reporting.
label2id = {'Normal': 0, 'Depression': 1, 'Anxiety': 2, 'Suicidal': 3}
id2label = {v: k for k, v in label2id.items()}

# Encode the string 'status' column straight into the integer 'labels'
# column that the HF Trainer expects (no intermediate 'label' column).
for frame in (df_train, df_test):
    frame['labels'] = frame['status'].map(label2id).astype(int)

# Hold out 12% of the training data for validation, stratified so that the
# class proportions of the (imbalanced) training set are preserved.
train_df, val_df = train_test_split(
    df_train,
    test_size=0.12,
    stratify=df_train['labels'],
    random_state=42,
)

# Wrap each split as a HF Dataset, keeping only the two needed columns.
train_ds = Dataset.from_pandas(train_df[['text', 'labels']].reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df[['text', 'labels']].reset_index(drop=True))
test_ds = Dataset.from_pandas(df_test[['text', 'labels']].reset_index(drop=True))
"""# **6. Tokenization**"""
MODEL_NAME = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize a batch of texts; truncate to 224 tokens.

    Padding is deliberately disabled here — DataCollatorWithPadding pads
    dynamically per batch, which is faster than padding every row to max_length.
    """
    return tokenizer(examples["text"], truncation=True, max_length=224, padding=False)

# Tokenize every split, dropping the raw text column afterwards so only
# model inputs (input_ids, attention_mask) and 'labels' remain.
tokenized_train = train_ds.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_val = val_ds.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_test = test_ds.map(tokenize_function, batched=True, remove_columns=["text"])
"""# **7. Load model**"""
# Use the GPU when Colab provides one.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Base DeBERTa-v3 encoder with a freshly initialised 4-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(
MODEL_NAME,
num_labels=4,
id2label=id2label,
label2id=label2id,
ignore_mismatched_sizes=True,  # allow replacing any pretrained head with a new 4-class one
torch_dtype=torch.float16,  # load weights in half precision to save GPU memory; trainable params are re-cast to fp32 below
).to(device)
"""# **8. Apply LoRA**"""
# LoRA configuration: rank-16 adapters on DeBERTa's attention query/value
# projections; the new classification head is trained in full.
lora_config = LoraConfig(
    r=16,                                          # adapter rank
    lora_alpha=32,                                 # scaling = alpha / r = 2
    target_modules=["query_proj", "value_proj"],   # DeBERTa attention projections
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    modules_to_save=["classifier"],                # head is fully trainable, not LoRA-ified
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# The base model was loaded in fp16; up-cast only the trainable parameters
# back to fp32 so optimizer updates stay numerically stable under fp16 training.
for name, param in model.named_parameters():
    if param.requires_grad:
        param.data = param.data.float()
print("Trainable parameters")
"""# **9. Metrics**"""
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a Trainer (logits, labels) pair.

    Weighted F1 accounts for the class imbalance in the training data.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }
"""# **10. TrainingArguments**"""
# Hyperparameters: effective batch size 16 (8 x 2 accumulation), cosine decay
# with 10% warmup, fp16 mixed precision, best checkpoint picked by weighted F1.
training_args = TrainingArguments(
output_dir = "./mental_health_deberta_lora",
num_train_epochs = 4,
per_device_train_batch_size = 8,
per_device_eval_batch_size = 16,
gradient_accumulation_steps = 2,  # effective train batch = 16
learning_rate = 1.5e-4,  # high LR is normal for LoRA (only adapters + head train)
weight_decay = 0.01,
warmup_ratio = 0.1,
fp16 = True,
eval_strategy = "epoch",
save_strategy = "epoch",  # must match eval_strategy for load_best_model_at_end
logging_steps = 100,
load_best_model_at_end = True,
metric_for_best_model = "f1",
greater_is_better = True,
report_to = "none",  # disable wandb/tensorboard reporting
optim = "adamw_torch",
max_grad_norm = 0.5,  # tighter clipping for fp16 stability
lr_scheduler_type = "cosine",
dataloader_num_workers = 2,
remove_unused_columns = False,  # keep columns PEFT wrappers may need
)
"""# **11. Trainer**"""
# Trainer wiring: dynamic per-batch padding via DataCollatorWithPadding,
# validation set used for epoch-level checkpoint selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # 'tokenizer=' is deprecated since transformers 4.46 (the pinned 4.51.0
    # warns on it); 'processing_class' is the supported equivalent.
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
"""# **12. training**"""
# Fine-tune; the best checkpoint (by weighted F1) is reloaded at the end
# because load_best_model_at_end=True.
print("Starting training...")
trainer.train()
"""# **13. Evaluate & plot**"""
# Aggregate metrics on the held-out balanced test split.
test_results = trainer.evaluate(tokenized_test)
print("\nTest results:", test_results)

# Per-example predictions for the detailed per-class report.
predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

print("\nClassification Report:\n")
print(classification_report(true_labels, preds, target_names=list(id2label.values())))

# Confusion Matrix
class_names = list(id2label.values())
cm = confusion_matrix(true_labels, preds)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix – Balanced Test Set")
plt.show()
"""# **14. Save LoRA adapter**"""
print("Merging LoRA weights into base model...")
# Fold the LoRA adapters into the base weights so the result is a plain
# (non-PEFT) model that loads without the peft library.
merged_model = model.merge_and_unload()

# Optional: Save merged model locally first (for backup)
save_dir = "./merged_mental_health_deberta"
merged_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("Merged model saved locally.")
"""# **15. Push merged model + tokenizer to Hugging Face Hub**"""
repo_id = "OmidSakaki/mental-health-deberta"
print(f"Pushing merged model to: https://huggingface.co/{repo_id}")

# Upload the merged weights (safetensors format) and the tokenizer to a
# public Hub repository as two separate commits.
merged_model.push_to_hub(
    repo_id,
    commit_message="Full merged model after LoRA fine-tuning (4-class mental health classification)",
    safe_serialization=True,
    private=False,
)
tokenizer.push_to_hub(
    repo_id,
    commit_message="Tokenizer for merged mental health model",
)
print("Upload completed! Model is now live at:", f"https://huggingface.co/{repo_id}")