import pandas as pd import numpy as np import torch from torch_geometric.data import Data from sklearn.preprocessing import StandardScaler, LabelEncoder from sklearn.impute import SimpleImputer import os def load_and_preprocess_data(data_dir='data'): # 1. Load Data clinical = pd.read_csv(os.path.join(data_dir, 'clinical.csv')) blood = pd.read_csv(os.path.join(data_dir, 'blood.csv')) pathological = pd.read_csv(os.path.join(data_dir, 'pathological.csv')) targets = pd.read_csv(os.path.join(data_dir, 'targets.csv')) # 2. Merge Data df = clinical.merge(blood, on='patient_id', how='inner') df = df.merge(pathological, on='patient_id', how='inner') df = df.merge(targets, on='patient_id', how='inner') # 3. Target Definitions # Task A: Recurrence (0: no, 1: yes) df['recurrence_label'] = df['recurrence'].map({'no': 0, 'yes': 1}).fillna(0).astype(int) # Task B: Survival Score (Normalized) # Strategy: (days / max_days) * (1.2 if living else 0.8) max_days = df['days_to_last_information'].max() df['survival_score'] = (df['days_to_last_information'] / max_days) df.loc[df['survival_status'] == 'dead', 'survival_score'] *= 0.8 df.loc[df['survival_status'] == 'living', 'survival_score'] *= 1.2 # Clip to [0, 1] df['survival_score'] = df['survival_score'].clip(0, 1) # 4. Preprocessing # Identify column types blood_cols = [c for c in blood.columns if c != 'patient_id'] pathology_cols = [c for c in pathological.columns if c != 'patient_id'] clinical_cols = [c for c in clinical.columns if c != 'patient_id'] # Impute Blood (Median) blood_imputer = SimpleImputer(strategy='median') df[blood_cols] = blood_imputer.fit_transform(df[blood_cols]) # Impute Pathology (Mode) path_imputer = SimpleImputer(strategy='most_frequent') df[pathology_cols] = path_imputer.fit_transform(df[pathology_cols]) # Feature Engineering / Encoding # For categorical columns that are not IDs or targets cat_cols = ['sex', 'primarily_metastasis', 'smoking_status', 'hpv_association_p16'] for col in cat_cols: if col in df.columns: df[col] = LabelEncoder().fit_transform(df[col].astype(str)) # Scale Numerical Features num_cols = blood_cols + clinical_cols + ['infiltration_depth_in_mm'] num_cols = [c for c in num_cols if c in df.columns and c not in cat_cols] scaler = StandardScaler() df[num_cols] = scaler.fit_transform(df[num_cols]) # 5. Graph Construction (The Clinical Twin) # Nodes: patients in df node_features = df[num_cols + cat_cols].values x = torch.tensor(node_features, dtype=torch.float) # Edges: Same primary_tumor_site AND pT_stage edge_index = [] edge_attr = [] patient_indices = df.index.tolist() for i in range(len(patient_indices)): for j in range(i + 1, len(patient_indices)): p1 = df.iloc[i] p2 = df.iloc[j] if p1['primary_tumor_site'] == p2['primary_tumor_site'] and p1['pT_stage'] == p2['pT_stage']: # Similarity Weight based on grading and pN_stage # We'll use 1 / (1 + abs(diff)) grading_sim = 1.0 / (1.0 + abs(float(p1['grading']) - float(p2['grading']))) pn_sim = 1.0 / (1.0 + abs(float(p1['pN_stage']) - float(p2['pN_stage']))) weight = (grading_sim + pn_sim) / 2.0 edge_index.append([i, j]) edge_index.append([j, i]) edge_attr.append([weight]) edge_attr.append([weight]) if not edge_index: # Fallback: connect everyone with a very low weight if no twins found # Or just empty edges for now (PyG handles this) edge_index = torch.empty((2, 0), dtype=torch.long) edge_attr = torch.empty((0, 1), dtype=torch.float) else: edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous() edge_attr = torch.tensor(edge_attr, dtype=torch.float) # Targets y_recurrence = torch.tensor(df['recurrence_label'].values, dtype=torch.long) y_survival = torch.tensor(df['survival_score'].values, dtype=torch.float).view(-1, 1) data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y_recurrence=y_recurrence, y_survival=y_survival) return data, df, num_cols + cat_cols if __name__ == "__main__": data, df, features = load_and_preprocess_data('../data') print(f"Graph constructed with {data.num_nodes} nodes and {data.num_edges} edges.") print(f"Features: {len(features)}") print(f"Recurrence balance: {df['recurrence_label'].value_counts().to_dict()}")