Spaces:
Sleeping
Sleeping
| import argparse | |
| import os | |
| from typing import Tuple | |
| import numpy as np | |
| import pandas as pd | |
| import xgboost as xgb | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix | |
def load_data(csv_path: str) -> pd.DataFrame:
    """Read the dataset CSV at ``csv_path`` into a DataFrame.

    Args:
        csv_path: Filesystem path of the CSV file to read.

    Returns:
        The parsed table, unchanged.

    Raises:
        FileNotFoundError: If nothing exists at ``csv_path``.
        ValueError: If the parsed table has fewer than two columns, i.e.
            there is no room for both a target and at least one feature.
    """
    path_missing = not os.path.exists(csv_path)
    if path_missing:
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    frame = pd.read_csv(csv_path)

    _, n_columns = frame.shape
    if n_columns < 2:
        raise ValueError("CSV must have at least 2 columns (target + features)")
    return frame
def split_encode(df: pd.DataFrame, test_size: float, seed: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, LabelEncoder, list]:
    """Split ``df`` into train/test parts and integer-encode the target.

    The first column of ``df`` is treated as the target; every remaining
    column is a feature. The split is stratified on the target.

    Args:
        df: Table whose first column is the target.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        seed: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A 6-tuple ``(X_train, X_test, y_train_enc, y_test_enc, encoder,
        feature_names)``: the feature matrices as ``.values`` arrays, the
        encoded target vectors, the fitted ``LabelEncoder`` and the list of
        feature column names.
    """
    target_column = df.columns[0]
    features = df.iloc[:, 1:]
    labels = df[target_column]

    parts = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=seed,
        stratify=labels,
    )
    train_features, test_features, train_labels, test_labels = parts

    # The encoder is fitted on the training labels only; the test labels are
    # mapped with that same fitted encoder.
    encoder = LabelEncoder()
    train_codes = encoder.fit_transform(train_labels)
    test_codes = encoder.transform(test_labels)

    feature_names = features.columns.tolist()
    return (
        train_features.values,
        test_features.values,
        train_codes,
        test_codes,
        encoder,
        feature_names,
    )
def build_model(num_classes: int):
    """Construct the unfitted multi-class XGBoost classifier.

    Three constructor variants are tried in order: the shared settings plus
    ``device="cuda"``; the shared settings with ``tree_method`` replaced by
    ``"gpu_hist"``; and finally the shared settings alone.

    Args:
        num_classes: Number of target classes, passed as ``num_class``.

    Returns:
        An ``xgb.XGBClassifier`` instance that has not been fitted.
    """
    shared_params = {
        "objective": "multi:softprob",
        "num_class": num_classes,
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "n_estimators": 300,
        "max_depth": 6,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42,
    }

    # NOTE(review): the fallbacks below only run if the constructor itself
    # raises. Confirm the installed XGBoost really rejects an unsupported
    # ``device`` keyword with TypeError; if it accepts arbitrary keywords,
    # the first variant is always the one returned.
    try:
        return xgb.XGBClassifier(device="cuda", **shared_params)
    except TypeError:
        # Same settings, but with the tree method swapped for "gpu_hist".
        gpu_hist_params = {**shared_params, "tree_method": "gpu_hist"}
        try:
            return xgb.XGBClassifier(**gpu_hist_params)
        except Exception:
            # Last resort: the shared settings as-is (tree_method="hist").
            return xgb.XGBClassifier(**shared_params)
def main():
    """Train the classifier on a CSV dataset and print evaluation metrics.

    Command-line arguments:
        --csv: Path to the input CSV (required).
        --test-size: Fraction of rows held out for testing (default 0.2).
        --seed: Random seed for the train/test split (default 42).

    Prints the data shape, class/feature counts, overall accuracy, a
    per-class classification report and the confusion matrix.
    """
    parser = argparse.ArgumentParser(description="Evaluate XGBoost Symptom Checker accuracy")
    parser.add_argument("--csv", required=True, help="Path to cleaned CSV (target + binary features)")
    parser.add_argument("--test-size", type=float, default=0.2, help="Test set fraction (default 0.2)")
    parser.add_argument("--seed", type=int, default=42, help="Random seed (default 42)")
    args = parser.parse_args()

    print("Loading data...")
    df = load_data(args.csv)
    print(f"Shape: {df.shape}")

    print("Splitting and encoding labels...")
    X_train, X_test, y_train, y_test, label_enc, feature_names = split_encode(df, args.test_size, args.seed)
    num_classes = len(np.unique(y_train))
    print(f"Classes: {num_classes}; Features: {len(feature_names)}")

    print("Training model...")
    model = build_model(num_classes)

    # NOTE: the test split is also used as the early-stopping eval set, so
    # the metrics printed below are measured on data that influenced when
    # training stopped.
    fit_kwargs = {"eval_set": [(X_test, y_test)], "verbose": 50}
    early_stopping_rounds = 30
    if hasattr(model, "early_stopping_rounds"):
        # Estimators that expose early stopping as a parameter take it via
        # set_params(); newer XGBoost releases no longer accept it in fit().
        model.set_params(early_stopping_rounds=early_stopping_rounds)
    else:
        # Older releases only accept it as a fit() keyword.
        fit_kwargs["early_stopping_rounds"] = early_stopping_rounds
    # A single fit call: no blanket TypeError retry that could hide an
    # unrelated error and silently retrain without early stopping.
    model.fit(X_train, y_train, **fit_kwargs)

    print("Evaluating...")
    y_proba = model.predict_proba(X_test)
    y_pred = np.argmax(y_proba, axis=1)
    acc = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy: {acc:.4f} ({acc*100:.2f}%)")

    print("\nClassification report:")
    class_ids = np.arange(num_classes)
    # Stringify the original labels: the report formats names by length,
    # which fails for non-string targets such as integers.
    target_names = [str(name) for name in label_enc.inverse_transform(class_ids)]
    # Pass the label set explicitly so the report and the matrix keep one
    # row per known class even if a class is absent from both y_test and
    # y_pred (otherwise target_names would not line up).
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=target_names, zero_division=0))

    print("Confusion matrix (rows=true, cols=pred):")
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    print(cm)


if __name__ == "__main__":
    main()