| |
| """Mental Health Text Classification.ipynb |
| |
| Automatically generated by Colab. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/11fDg8hX2svH1yGzRUU8Ji4ooR_4EoVXL |
| |
| # **1. Install packages** |
| """ |
|
|
# Colab shell magic: install pinned, mutually-compatible library versions.
# (unpinned packages — evaluate, scikit-learn, plotting libs — take latest)
!pip install -q --upgrade \
transformers==4.51.0 \
datasets==3.1.0 \
peft==0.13.2 \
accelerate==1.0.1 \
evaluate \
scikit-learn \
matplotlib seaborn wordcloud
|
|
| """# **2. Imports**""" |
|
|
| import os |
| import warnings |
| import torch |
| import numpy as np |
| import pandas as pd |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix |
| from transformers import ( |
| AutoTokenizer, |
| AutoModelForSequenceClassification, |
| Trainer, |
| TrainingArguments, |
| DataCollatorWithPadding, |
| ) |
| from peft import LoraConfig, get_peft_model, TaskType |
| from datasets import Dataset |
| from huggingface_hub import login |
| from google.colab import userdata, files |
|
|
# Global notebook setup.
# NOTE(review): blanket warning suppression also hides deprecation notices
# from transformers/peft — consider narrowing the filter.
warnings.filterwarnings("ignore")
plt.rcParams['figure.figsize'] = (10, 6)  # default size for all figures below
sns.set(style="whitegrid")
|
|
| """# **3. Hugging Face login**""" |
|
|
# Authenticate against the Hugging Face Hub (needed for the push step later).
# Preferred path: a Colab secret named HF_TOKEN; falls back to the interactive
# login widget, and finally proceeds unauthenticated (may hit rate limits).
try:
    login(token=userdata.get('HF_TOKEN'))
    print("Successfully logged in using Colab secret")
except Exception as e:
    print(f"Secret login failed: {e}")
    try:
        login()
        print("Interactive login successful")
    except Exception:
        # Was a bare `except:`, which would also swallow KeyboardInterrupt
        # and SystemExit; Exception is the right boundary for best-effort login.
        print("Login skipped – may hit rate limits")
|
|
| """# **4. Download dataset from Kaggle**""" |
|
|
# One-time Kaggle credential setup: prompt for kaggle.json if not already
# installed in this Colab session.
if not os.path.exists('/root/.kaggle/kaggle.json'):
    print("Please upload kaggle.json")
    uploaded = files.upload()
    if 'kaggle.json' in uploaded:
        !mkdir -p ~/.kaggle
        !cp kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json  # Kaggle CLI rejects world-readable credentials


# Download and unzip the dataset into ./data (quiet mode).
!kaggle datasets download -d priyangshumukherjee/mental-health-text-classification-dataset --unzip -p ./data -q
print("Downloaded files:", os.listdir('./data'))
|
|
| """# **5. Load data & prepare labels**""" |
|
|
# CSV paths as shipped inside the Kaggle archive (filename typos are the
# dataset's own — do not "fix" them or the read will fail).
TRAIN_PATH = './data/mental_heath_unbanlanced.csv'
TEST_PATH = './data/mental_health_combined_test.csv'


df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)


# Fixed 4-class label space for this classifier.
label2id = {'Normal': 0, 'Depression': 1, 'Anxiety': 2, 'Suicidal': 3}
id2label = {v: k for k, v in label2id.items()}


def _attach_labels(df):
    """Return a copy of *df* with an integer `labels` column mapped from `status`.

    Rows whose `status` is outside the 4 target classes are dropped first:
    `.map(label2id)` turns unknown statuses into NaN, and the original
    `.astype(int)` would then raise ValueError. The raw dataset may contain
    extra statuses (e.g. 'Bipolar', 'Stress') — TODO confirm against the CSV.
    """
    mapped = df['status'].map(label2id)
    n_unknown = int(mapped.isna().sum())
    if n_unknown:
        print(f"Dropping {n_unknown} rows with statuses outside {list(label2id)}")
    keep = mapped.notna()
    out = df.loc[keep].copy()
    out['labels'] = mapped[keep].astype(int)
    return out


df_train = _attach_labels(df_train)
df_test = _attach_labels(df_test)
|
|
| |
# Hold out 12% of the training CSV (stratified by class) for validation.
train_df, val_df = train_test_split(
    df_train, test_size=0.12, stratify=df_train['labels'], random_state=42
)


def _to_hf_dataset(frame):
    """Convert a (text, labels) DataFrame slice to a HF Dataset with a fresh index."""
    return Dataset.from_pandas(frame[['text', 'labels']].reset_index(drop=True))


train_ds = _to_hf_dataset(train_df)
val_ds = _to_hf_dataset(val_df)
test_ds = _to_hf_dataset(df_test)
|
|
| """# **6. Tokenization**""" |
|
|
MODEL_NAME = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(batch):
    """Tokenize a batch of texts; padding is deferred to the data collator."""
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=224,
        padding=False,
    )


# Tokenize all three splits, dropping the raw text column afterwards.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_function, batched=True, remove_columns=["text"])
    for split in (train_ds, val_ds, test_ds)
)
|
|
| """# **7. Load model**""" |
|
|
# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"


# Base encoder plus a freshly-initialized 4-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=4,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,  # new classifier head has no pretrained weights
    torch_dtype=torch.float16,     # half-precision weights; trainable params are re-cast to fp32 before training (see the LoRA section)
).to(device)
|
|
| """# **8. Apply LoRA**""" |
|
|
# LoRA: train low-rank adapters on the attention projections only.
lora_config = LoraConfig(
    r=16,                          # adapter rank
    lora_alpha=32,                 # scaling factor (alpha / r = 2.0)
    target_modules=["query_proj", "value_proj"],  # presumably DeBERTa-v3 attention module names — verify with model.named_modules()
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    modules_to_save=["classifier"]  # classification head is trained fully, not via LoRA
)


model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


# The base model was loaded in fp16; cast the trainable parameters back to
# fp32 — NOTE(review): this looks like the standard workaround for the
# Trainer's GradScaler refusing to unscale fp16 gradients; confirm if removed.
for name, param in model.named_parameters():
    if param.requires_grad:
        param.data = param.data.float()


print("Trainable parameters")
|
|
| """# **9. Metrics**""" |
|
|
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a `Trainer` evaluation step.

    Args:
        eval_pred: (logits, labels) pair supplied by the Trainer.

    Returns:
        dict with "accuracy" and "f1" (weighted-average) scores.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": f1_score(labels, predicted, average="weighted"),
    }
|
|
| """# **10. TrainingArguments**""" |
|
|
# Hyperparameters for LoRA fine-tuning.
training_args = TrainingArguments(
    output_dir = "./mental_health_deberta_lora",
    num_train_epochs = 4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps = 2,   # effective train batch size = 8 * 2 = 16
    learning_rate = 1.5e-4,            # relatively high LR, typical when only adapters train
    weight_decay = 0.01,
    warmup_ratio = 0.1,
    fp16 = True,                       # mixed-precision training
    eval_strategy = "epoch",
    save_strategy = "epoch",           # must match eval_strategy for load_best_model_at_end
    logging_steps = 100,
    load_best_model_at_end = True,
    metric_for_best_model = "f1",      # weighted F1 from compute_metrics
    greater_is_better = True,
    report_to = "none",                # disable wandb/tensorboard reporting
    optim = "adamw_torch",
    max_grad_norm = 0.5,               # gradient clipping
    lr_scheduler_type = "cosine",
    dataloader_num_workers = 2,
    remove_unused_columns = False,
)
|
|
| """# **11. Trainer**""" |
|
|
# Build the Trainer. The `tokenizer=` argument has been deprecated since
# transformers 4.46 in favor of `processing_class` (slated for removal in v5);
# with the pinned 4.51.0 this is behaviorally identical but warning-free.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_val,
    processing_class = tokenizer,
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer),  # dynamic per-batch padding
    compute_metrics = compute_metrics,
)
|
|
| """# **12. training**""" |
|
|
# Fine-tune: Trainer handles the epoch loop, per-epoch eval, and checkpointing.
print("Starting training...")
trainer.train()
|
|
| """# **13. Evaluate & plot**""" |
|
|
# Aggregate metrics on the held-out combined test set.
test_results = trainer.evaluate(tokenized_test)
print("\nTest results:", test_results)


# Raw predictions for the per-class breakdown below.
predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids


class_names = list(id2label.values())
print("\nClassification Report:\n")
print(classification_report(true_labels, preds, target_names=class_names))


# Confusion matrix heatmap: rows = true class, columns = predicted class.
cm = confusion_matrix(true_labels, preds)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix – Balanced Test Set")
plt.show()
|
|
| """# **14. Save LoRA adapter**""" |
|
|
# Fold the LoRA adapter weights back into the base model so inference no
# longer requires the peft library.
print("Merging LoRA weights into base model...")
merged_model = model.merge_and_unload()


# Persist the merged model and its tokenizer side by side locally.
merged_model.save_pretrained("./merged_mental_health_deberta")
tokenizer.save_pretrained("./merged_mental_health_deberta")
print("Merged model saved locally.")
|
|
| """# **15. Push merged model + tokenizer to Hugging Face Hub**""" |
|
|
# Target repository on the Hugging Face Hub (public).
repo_id = "OmidSakaki/mental-health-deberta"
print(f"Pushing merged model to: https://huggingface.co/{repo_id}")


# Upload model weights (safetensors) and the tokenizer files.
merged_model.push_to_hub(
    repo_id=repo_id,
    commit_message="Full merged model after LoRA fine-tuning (4-class mental health classification)",
    safe_serialization=True,
    private=False,
)
tokenizer.push_to_hub(
    repo_id=repo_id,
    commit_message="Tokenizer for merged mental health model",
)


print("Upload completed! Model is now live at:", f"https://huggingface.co/{repo_id}")