# Text_classification / evaluate_symptom_checker.py
# (Hugging Face page metadata from the original upload — "Tantawi's picture",
#  "Upload 13 files", commit "f2a4578 verified" — converted to comments so the
#  file is importable as Python.)
import argparse
import os
from typing import Tuple
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
def load_data(csv_path: str) -> pd.DataFrame:
    """Read the cleaned dataset from *csv_path* and sanity-check its shape.

    Args:
        csv_path: Path to a CSV whose first column is the target and whose
            remaining columns are features.

    Returns:
        The loaded DataFrame.

    Raises:
        FileNotFoundError: If *csv_path* does not exist.
        ValueError: If the CSV has fewer than two columns.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV not found: {csv_path}")
    frame = pd.read_csv(csv_path)
    # Need at least one target column plus one feature column.
    if frame.shape[1] < 2:
        raise ValueError("CSV must have at least 2 columns (target + features)")
    return frame
def split_encode(df: pd.DataFrame, test_size: float, seed: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, LabelEncoder, list]:
    """Stratified train/test split with integer-encoded labels.

    The first column of *df* is treated as the target; every remaining
    column is a feature.

    Args:
        df: Input frame (target first, then features).
        test_size: Fraction of rows held out for the test set.
        seed: Random state for the split.

    Returns:
        Tuple of (X_train, X_test, y_train_encoded, y_test_encoded,
        fitted LabelEncoder, list of feature-column names).
    """
    target_col = df.columns[0]
    features = df.iloc[:, 1:]
    labels = df[target_col]
    # Stratify on the labels so class proportions match in both splits.
    X_tr, X_te, y_tr, y_te = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=seed,
        stratify=labels,
    )
    # Fit the encoder on training labels only, then reuse it for the test set.
    encoder = LabelEncoder()
    y_tr_enc = encoder.fit_transform(y_tr)
    y_te_enc = encoder.transform(y_te)
    return X_tr.values, X_te.values, y_tr_enc, y_te_enc, encoder, features.columns.tolist()
def build_model(num_classes: int):
    """Construct a multi-class XGBClassifier, preferring GPU training.

    Tries, in order: the modern ``device="cuda"`` parameter, the legacy
    ``tree_method="gpu_hist"`` parameter, and finally a plain CPU model,
    so the same script runs across xgboost versions and on machines
    without a GPU.

    Args:
        num_classes: Number of target classes for the softprob objective.

    Returns:
        An unfitted ``xgb.XGBClassifier``.
    """
    base_params = dict(
        objective="multi:softprob",
        num_class=num_classes,
        eval_metric="mlogloss",
        tree_method="hist",
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
    try:
        # Newer xgboost selects the GPU via the `device` keyword.
        return xgb.XGBClassifier(device="cuda", **base_params)
    except TypeError:
        pass
    try:
        # Older xgboost selects the GPU via tree_method="gpu_hist";
        # drop the CPU tree_method to avoid a duplicate keyword.
        legacy_params = {k: v for k, v in base_params.items() if k != "tree_method"}
        return xgb.XGBClassifier(tree_method="gpu_hist", **legacy_params)
    except Exception:
        # Last resort: CPU histogram training.
        return xgb.XGBClassifier(**base_params)
def main():
    """CLI entry point: train an XGBoost symptom classifier and print metrics.

    Reads a cleaned CSV (target column first), does a stratified split,
    trains the model, and prints accuracy, a per-class report, and the
    confusion matrix.
    """
    parser = argparse.ArgumentParser(description="Evaluate XGBoost Symptom Checker accuracy")
    parser.add_argument("--csv", required=True, help="Path to cleaned CSV (target + binary features)")
    parser.add_argument("--test-size", type=float, default=0.2, help="Test set fraction (default 0.2)")
    parser.add_argument("--seed", type=int, default=42, help="Random seed (default 42)")
    args = parser.parse_args()

    print("Loading data...")
    df = load_data(args.csv)
    print(f"Shape: {df.shape}")

    print("Splitting and encoding labels...")
    X_train, X_test, y_train, y_test, label_enc, feature_names = split_encode(
        df, args.test_size, args.seed
    )
    num_classes = len(np.unique(y_train))
    print(f"Classes: {num_classes}; Features: {len(feature_names)}")

    print("Training model...")
    model = build_model(num_classes)
    try:
        # Older xgboost accepts early_stopping_rounds as a fit() kwarg.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=50, early_stopping_rounds=30)
    except TypeError:
        # Newer xgboost removed it from fit(); train without early stopping.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=50)

    print("Evaluating...")
    proba = model.predict_proba(X_test)
    preds = np.argmax(proba, axis=1)
    acc = accuracy_score(y_test, preds)
    print(f"\nAccuracy: {acc:.4f} ({acc*100:.2f}%)")

    print("\nClassification report:")
    # Map encoded class indices back to their original label names.
    class_names = label_enc.inverse_transform(np.arange(num_classes))
    print(classification_report(y_test, preds, target_names=class_names, zero_division=0))

    print("Confusion matrix (rows=true, cols=pred):")
    print(confusion_matrix(y_test, preds))
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()