"""Build train/test CSV files for text classification from a spreadsheet.

Steps performed, in order:

1. Read ``data/sample.xlsx`` and print a short preview.
2. Join every column except ``ID``, ``PHQ-9 Score`` and ``Severity Level``
   into a single ``text`` string per row, separated by ``" | "``.
3. Rename ``Severity Level`` to ``label``, drop rows without a label and
   encode the remaining labels as integers.
4. Write a stratified 80/20 split to ``data/train.csv`` and
   ``data/test.csv`` (columns ``text`` and ``label`` only).
"""

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw spreadsheet and print a preview for a quick manual check.
df = pd.read_excel("data/sample.xlsx")

print("Columns in dataset:", df.columns)
print("First few rows:")
print(df.head())

# Everything that is not the identifier or one of the two outcome columns is
# treated as model input; excluding the outcome columns keeps the target out
# of the text.
question_columns = [
    col for col in df.columns
    if col not in ['ID', 'PHQ-9 Score', 'Severity Level']
]

# One string per row. ``astype(str)`` also stringifies missing cells, so an
# empty answer shows up in the text as the literal "nan".
df['text'] = df[question_columns].astype(str).agg(' | '.join, axis=1)

# Use the generic column name "label" and discard rows that have no target.
df = df.rename(columns={"Severity Level": "label"})
df = df.dropna(subset=['label'])

print(f"Number of samples: {len(df)}")
print(f"Unique labels: {df['label'].unique()}")

# Integer ids are assigned in order of first appearance in the data, so the
# mapping depends on the row order of the spreadsheet.
label2id = {label: i for i, label in enumerate(df["label"].unique())}
# Inverse mapping; it is not used again in the visible part of this script.
id2label = {i: label for label, i in label2id.items()}
df["label"] = df["label"].map(label2id)

# Fixed seed for a reproducible split; stratifying keeps the class
# proportions the same in both parts.
# NOTE(review): scikit-learn rejects stratified splits when a class has fewer
# than two samples -- confirm the data always satisfies this.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Only the model input and the target are written out.
train_df = train_df[['text', 'label']]
test_df = test_df[['text', 'label']]

train_df.to_csv("data/train.csv", index=False)
test_df.to_csv("data/test.csv", index=False)

print("✅ Data processed and saved as train.csv & test.csv")
print("Label mapping:", label2id)
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")