Mindspace / src /dataset.py
akhilesh98's picture
Upload AI_Model (code + final model)
b018a0a verified
import pandas as pd
from sklearn.model_selection import train_test_split
# ---------------------------------------------------------------------------
# PHQ-9 dataset preparation
#
# Reads the raw spreadsheet, joins the per-question responses into a single
# text field, encodes 'Severity Level' as an integer label, and writes a
# stratified train/test split to CSV.
# ---------------------------------------------------------------------------

# Read the source spreadsheet and show a quick preview of its contents
df = pd.read_excel("data/sample.xlsx")
print("Columns in dataset:", df.columns)
print("First few rows:", df.head(), sep="\n")

# Every column other than ID, PHQ-9 Score and Severity Level is treated as a
# PHQ-9 question response and used as a feature
_NON_QUESTION_COLUMNS = ('ID', 'PHQ-9 Score', 'Severity Level')
question_columns = [
    name for name in df.columns if name not in _NON_QUESTION_COLUMNS
]

# Model input: each row's responses, cast to strings and joined with " | "
_responses_as_text = df[question_columns].astype(str)
df['text'] = _responses_as_text.agg(' | '.join, axis=1)

# 'Severity Level' is the target: expose it as 'label' and discard rows
# where it is missing
df = df.rename(columns={"Severity Level": "label"}).dropna(subset=['label'])

_unique_labels = df["label"].unique()
print(f"Number of samples: {len(df)}")
print(f"Unique labels: {_unique_labels}")

# Give each distinct label an integer id, following the order in which
# unique() returned them, and keep the inverse mapping for decoding
label2id = dict(zip(_unique_labels, range(len(_unique_labels))))
id2label = dict(zip(label2id.values(), label2id.keys()))
df["label"] = df["label"].map(label2id)

# Stratified split (20% held out for testing, fixed seed for
# reproducibility); only the columns needed for training are retained
_TRAINING_COLUMNS = ['text', 'label']
train_df, test_df = (
    part[_TRAINING_COLUMNS]
    for part in train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['label']
    )
)

# Write both splits to disk without the index column
for _csv_path, _split in (("data/train.csv", train_df), ("data/test.csv", test_df)):
    _split.to_csv(_csv_path, index=False)

# Final summary
print("✅ Data processed and saved as train.csv & test.csv")
print("Label mapping:", label2id)
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")