In [2]:
pip install pandas datasets transformers peft accelerate bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m60.1/60.1 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1


In [3]:
import pandas as pd
import json
import os

files = [
    "/content/Stress.csv",
    "/content/ds.csv",
    "/content/ocd_patient_dataset.csv",
    "/content/synthetic_ptsd_patients.csv",
    "/content/Mental health Depression disorder Data.csv",
    "/content/Copy of ds.csv"
]

question_cols = [
    "question", "questions", "prompt", "input", "q", "query", "text", "phrase"
]
answer_cols = [
    "answer", "answers", "response", "output", "a", "completion", "reply", "label", "sentiment"
]

all_dfs = []
qa_examples = []  # For mental health depression dataset

for f in files:
    ext = os.path.splitext(f)[1].lower()
    if ext != ".csv":
        print(f"‚ö†Ô∏è Skipping non-CSV file: {f}")
        continue

    try:
        try:
            df = pd.read_csv(f, encoding="utf-8")
        except UnicodeDecodeError:
            df = pd.read_csv(f, encoding="latin1")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not read {f}: {e}")
        continue

    df.columns = [c.strip() for c in df.columns]

    # Process Mental health Depression disorder Data.csv using your working snippet
    if os.path.basename(f) == "Mental health Depression disorder Data.csv":
        disorder_cols = [
            'Schizophrenia (%)',
            'Bipolar disorder (%)',
            'Eating disorders (%)',
            'Anxiety disorders (%)',
            'Drug use disorders (%)',
            'Depression (%)',
            'Alcohol use disorders (%)'
        ]

        for _, row in df.iterrows():
            entity = row['Entity']
            year = row['Year']
            for disorder in disorder_cols:
                value = row[disorder]
                if pd.isna(value):
                    continue
                question = f"What is the percentage of {disorder.replace(' (%)', '')} in {entity} for year {year}?"
                answer = f"{value}%"
                qa_examples.append({
                    "instruction": question,
                    "input": "",
                    "output": answer
                })
        print(f"‚úÖ Generated {len(qa_examples)} Q&A examples from {f}")

    elif os.path.basename(f) == "ocd_patient_dataset.csv":
        if "Obsession Type" in df.columns and "Compulsion Type" in df.columns:
            df_sub = df[["Obsession Type", "Compulsion Type"]].rename(columns={
                "Obsession Type": "Question",
                "Compulsion Type": "Answer"
            })
            df_sub = df_sub.dropna(subset=["Question", "Answer"])
            all_dfs.append(df_sub)
            print(f"‚úÖ Loaded {len(df_sub)} rows from {f}")
        else:
            print(f"‚ö†Ô∏è Skipping {f}: Missing 'Obsession Type' or 'Compulsion Type' columns")

    elif os.path.basename(f) == "synthetic_ptsd_patients.csv":
        if "trauma_type" in df.columns and "has_ptsd" in df.columns:
            df_sub = df[["trauma_type", "has_ptsd"]].rename(columns={
                "trauma_type": "Question",
                "has_ptsd": "Answer"
            })
            df_sub = df_sub.dropna(subset=["Question", "Answer"])
            all_dfs.append(df_sub)
            print(f"‚úÖ Loaded {len(df_sub)} rows from {f}")
        else:
            print(f"‚ö†Ô∏è Skipping {f}: Missing 'trauma_type' or 'has_ptsd' columns")

    else:
        # Generic Q&A detection for other files
        df.columns = [c.lower() for c in df.columns]  # normalize for matching
        q_col = next((c for c in df.columns if c in [qc.lower() for qc in question_cols]), None)
        a_col = next((c for c in df.columns if c in [ac.lower() for ac in answer_cols]), None)
        if not q_col or not a_col:
            print(f"‚ö†Ô∏è Skipping {f}: Missing 'Question' or 'Answer' column")
            continue
        df_sub = df[[q_col, a_col]].rename(columns={q_col: "Question", a_col: "Answer"})
        df_sub = df_sub.dropna(subset=["Question", "Answer"])
        all_dfs.append(df_sub)
        print(f"‚úÖ Loaded {len(df_sub)} rows from {f}")

# Combine normal Q/A datasets
if all_dfs:
    df_combined = pd.concat(all_dfs, ignore_index=True)
else:
    df_combined = pd.DataFrame(columns=["Question", "Answer"])

# Write all data to JSONL
with open("training_data.jsonl", "w", encoding="utf-8") as f_out:
    for _, row in df_combined.iterrows():
        q = str(row["Question"]).strip()
        a = str(row["Answer"]).strip()
        if q and a:
            example = {"instruction": q, "input": "", "output": a}
            f_out.write(json.dumps(example, ensure_ascii=False) + "\n")

    for example in qa_examples:
        f_out.write(json.dumps(example, ensure_ascii=False) + "\n")

total = len(df_combined) + len(qa_examples)
print(f"‚úÖ Saved merged dataset with {total} examples to training_data.jsonl")


‚úÖ Loaded 2838 rows from /content/Stress.csv
‚úÖ Loaded 3479 rows from /content/ds.csv
‚úÖ Loaded 1500 rows from /content/ocd_patient_dataset.csv
‚úÖ Loaded 500 rows from /content/synthetic_ptsd_patients.csv
‚úÖ Generated 45276 Q&A examples from /content/Mental health Depression disorder Data.csv
‚úÖ Loaded 3479 rows from /content/Copy of ds.csv
‚úÖ Saved merged dataset with 57072 examples to training_data.jsonl


In [4]:
# Show head of each loaded CSV / processed DF
for f in files:
    ext = os.path.splitext(f)[1].lower()
    if ext != ".csv":
        continue

    try:
        try:
            df = pd.read_csv(f, encoding="utf-8")
        except UnicodeDecodeError:
            df = pd.read_csv(f, encoding="latin1")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not read {f}: {e}")
        continue

    print(f"\nüìÑ Head of {os.path.basename(f)}:")
    display(df.head(5))  # top 5 rows



üìÑ Head of Stress.csv:


Unnamed: 0,subreddit,post_id,sentence_range,text,label,confidence,social_timestamp
0,ptsd,8601tu,"(15, 20)","He said he had not felt that way before, sugge...",1,0.8,1521614353
1,assistance,8lbrx9,"(0, 5)","Hey there r/assistance, Not sure if this is th...",0,1.0,1527009817
2,ptsd,9ch1zh,"(15, 20)",My mom then hit me with the newspaper and it s...,1,0.8,1535935605
3,relationships,7rorpp,"[5, 10]","until i met my new boyfriend, he is amazing, h...",1,0.6,1516429555
4,survivorsofabuse,9p2gbc,"[0, 5]",October is Domestic Violence Awareness Month a...,1,0.8,1539809005



üìÑ Head of ds.csv:


Unnamed: 0,phrase,sentiment
0,"""I love spending time with my family.""",positive
1,"""Sunshine always brightens my day.""",positive
2,"""Helping others is so rewarding.""",positive
3,"""A good book can transport you to another world.""",positive
4,"""The smell of freshly baked bread is amazing.""",positive



üìÑ Head of ocd_patient_dataset.csv:


Unnamed: 0,Patient ID,Age,Gender,Ethnicity,Marital Status,Education Level,OCD Diagnosis Date,Duration of Symptoms (months),Previous Diagnoses,Family History of OCD,Obsession Type,Compulsion Type,Y-BOCS Score (Obsessions),Y-BOCS Score (Compulsions),Depression Diagnosis,Anxiety Diagnosis,Medications
0,1018,32,Female,African,Single,Some College,2016-07-15,203,MDD,No,Harm-related,Checking,17,10,Yes,Yes,SNRI
1,2406,69,Male,African,Divorced,Some College,2017-04-28,180,,Yes,Harm-related,Washing,21,25,Yes,Yes,SSRI
2,1188,57,Male,Hispanic,Divorced,College Degree,2018-02-02,173,MDD,No,Contamination,Checking,3,4,No,No,Benzodiazepine
3,6200,27,Female,Hispanic,Married,College Degree,2014-08-25,126,PTSD,Yes,Symmetry,Washing,14,28,Yes,Yes,SSRI
4,5824,56,Female,Hispanic,Married,High School,2022-02-20,168,PTSD,Yes,Hoarding,Ordering,39,18,No,No,



üìÑ Head of synthetic_ptsd_patients.csv:


Unnamed: 0,patient_id,age,gender,trauma_type,intrusive_thoughts,nightmares,avoidance,negative_mood,hypervigilance,pcl5_score,has_ptsd
0,1,56,Female,Accident,2,3,3,1,4,52,1
1,2,69,Male,Combat-related,4,1,2,1,4,48,1
2,3,46,Male,Other,2,3,4,2,1,48,1
3,4,32,Female,Other,1,1,3,0,4,27,0
4,5,60,Male,Natural disaster,0,4,4,0,4,36,1



üìÑ Head of Mental health Depression disorder Data.csv:


Unnamed: 0,Entity,Code,Year,Schizophrenia (%),Bipolar disorder (%),Eating disorders (%),Anxiety disorders (%),Drug use disorders (%),Depression (%),Alcohol use disorders (%)
0,Afghanistan,AFG,1990,0.16056,0.697779,0.101855,4.82883,1.677082,4.071831,0.672404
1,Afghanistan,AFG,1991,0.160312,0.697961,0.099313,4.82974,1.684746,4.079531,0.671768
2,Afghanistan,AFG,1992,0.160135,0.698107,0.096692,4.831108,1.694334,4.088358,0.670644
3,Afghanistan,AFG,1993,0.160037,0.698257,0.094336,4.830864,1.70532,4.09619,0.669738
4,Afghanistan,AFG,1994,0.160022,0.698469,0.092439,4.829423,1.716069,4.099582,0.66926



üìÑ Head of Copy of ds.csv:


Unnamed: 0,phrase,sentiment
0,"""I love spending time with my family.""",positive
1,"""Sunshine always brightens my day.""",positive
2,"""Helping others is so rewarding.""",positive
3,"""A good book can transport you to another world.""",positive
4,"""The smell of freshly baked bread is amazing.""",positive


In [6]:
from datasets import load_dataset

# Load the JSONL dataset
dataset = load_dataset(
    "json",
    data_files="/content/training_data.jsonl",  # or training_data.jsonl
    split="train"
)

print(dataset[0])  # preview first item


Generating train split: 0 examples [00:00, ? examples/s]

{'instruction': 'He said he had not felt that way before, suggeted I go rest and so ..TRIGGER AHEAD IF YOUI\'RE A HYPOCONDRIAC LIKE ME: i decide to look up "feelings of doom" in hopes of maybe getting sucked into some rabbit hole of ludicrous conspiracy, a stupid "are you psychic" test or new age b.s., something I could even laugh at down the road. No, I ended up reading that this sense of doom can be indicative of various health ailments; one of which I am prone to.. So on top of my "doom" to my gloom..I am now f\'n worried about my heart. I do happen to have a physical in 48 hours.', 'input': '', 'output': '1'}


In [7]:
import torch
torch.cuda.empty_cache()


In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch

model_name = "tanusrich/Mental_Health_Chatbot"
data_path = "training_data.jsonl"

# Load dataset
dataset = load_dataset("json", data_files=data_path, split="train")

# Load tokenizer and model with 4-bit quantization using BitsAndBytesConfig (new way)
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16, # Added this parameter
)

# LoRA configuration for LLaMA-based model
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # typical for LLaMA
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# Tokenization helper (fixed)
def preprocess(examples):
    inputs = [
        instr + "\n" + inp if inp else instr
        for instr, inp in zip(
            examples["instruction"],
            examples.get("input", [""] * len(examples["instruction"]))
        )
    ]
    outputs = examples["output"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=512, truncation=True, padding="max_length").input_ids

    model_inputs["labels"] = labels
    return model_inputs

# Apply preprocessing
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names,
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./lora_mht_chatbot_finetuned",
    per_device_train_batch_size=1,  # Reduced batch size
    gradient_accumulation_steps=4,  # Adjusted accumulation steps
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=True,
    save_strategy="epoch",
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_total_limit=2,
    report_to="none",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()

# Save adapters and tokenizer
model.save_pretrained("./lora_mht_chatbot_finetuned")
tokenizer.save_pretrained("./lora_mht_chatbot_finetuned")

print("‚úÖ Fine-tuning complete and saved!")

Generating train split: 0 examples [00:00, ? examples/s]

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/57072 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
10,23.6531
20,23.3329
30,23.2276
40,23.2085
50,23.4249
60,23.4031
70,23.3057
80,23.3682
90,23.2836
100,23.233
