"""Prepare the PHQ-9 Excel dataset for text classification.

Reads ``data/sample.xlsx``, joins the individual question responses of each
row into a single ``text`` string, encodes ``Severity Level`` as an integer
``label`` and writes a stratified 80/20 train/test split to
``data/train.csv`` and ``data/test.csv``.
"""

import pandas as pd

INPUT_PATH = "data/sample.xlsx"
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

LABEL_COLUMN = "Severity Level"
# Columns that are not question responses and must not end up in the text.
NON_FEATURE_COLUMNS = ("ID", "PHQ-9 Score", LABEL_COLUMN)


def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with a ``text`` feature and a ``label`` column.

    Every column except ``ID``, ``PHQ-9 Score`` and ``Severity Level`` is
    treated as a question response. The responses of a row are converted to
    strings and joined with ``" | "`` to form ``text``. Rows without a
    severity level are dropped and ``Severity Level`` is renamed to
    ``label``. The input frame is not modified.

    Raises:
        KeyError: if *df* has no ``Severity Level`` column.
        ValueError: if there are no question columns, or no row has a label.
    """
    if LABEL_COLUMN not in df.columns:
        # Without this check the rename below would silently do nothing and
        # the failure would surface later as an unhelpful KeyError on 'label'.
        raise KeyError(
            f"Required column '{LABEL_COLUMN}' not found; "
            f"available columns: {list(df.columns)}"
        )

    question_columns = [
        col for col in df.columns if col not in NON_FEATURE_COLUMNS
    ]
    if not question_columns:
        raise ValueError("Dataset contains no question columns to use as features")

    # dropna returns a new frame, so the caller's data stays untouched.
    labelled = df.dropna(subset=[LABEL_COLUMN])
    if labelled.empty:
        raise ValueError(f"No rows with a non-missing '{LABEL_COLUMN}' value")

    # Combine all question responses into a single text feature.
    labelled["text"] = (
        labelled[question_columns].astype(str).agg(" | ".join, axis=1)
    )
    return labelled.rename(columns={LABEL_COLUMN: "label"})


def encode_labels(labels: pd.Series) -> tuple:
    """Encode *labels* as integers.

    Ids are assigned in order of first appearance in *labels*, so the mapping
    depends on the row order of the data.

    Returns:
        A ``(encoded, label2id, id2label)`` tuple: the encoded series and the
        forward and reverse mappings.
    """
    label2id = {label: i for i, label in enumerate(labels.unique())}
    id2label = {i: label for label, i in label2id.items()}
    return labels.map(label2id), label2id, id2label


def split_dataset(
    df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42
) -> tuple:
    """Split *df* into train and test frames, stratified on ``label``.

    Only the ``text`` and ``label`` columns are kept in the returned frames.
    Errors raised by ``train_test_split`` (for instance when stratification
    is not possible for the given class counts) are propagated unchanged.
    """
    # scikit-learn is only needed for the split; importing it here keeps the
    # pure-pandas helpers above importable without it.
    from sklearn.model_selection import train_test_split

    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df["label"],
    )
    # Keep only the columns we need for training.
    return train_df[["text", "label"]], test_df[["text", "label"]]


def main() -> None:
    """Run the full pipeline: load, transform, split, save and report."""
    # Load your Excel dataset
    df = pd.read_excel(INPUT_PATH)
    print("Columns in dataset:", df.columns)
    print("First few rows:")
    print(df.head())

    # For the PHQ-9 dataset the individual question responses are the
    # features and 'Severity Level' is the target label.
    df = build_features(df)
    print(f"Number of samples: {len(df)}")
    print(f"Unique labels: {df['label'].unique()}")

    # Encode labels into numbers
    encoded, label2id, _ = encode_labels(df["label"])
    df["label"] = encoded

    # Split into train/test
    train_df, test_df = split_dataset(df)

    # Save CSVs
    train_df.to_csv(TRAIN_PATH, index=False)
    test_df.to_csv(TEST_PATH, index=False)

    print("✅ Data processed and saved as train.csv & test.csv")
    print("Label mapping:", label2id)
    print(f"Training samples: {len(train_df)}")
    print(f"Test samples: {len(test_df)}")


if __name__ == "__main__":
    main()