Spaces:
Build error
Build error
File size: 1,345 Bytes
f3b5806 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Machine learning model: XGBoost
# Load the raw dataset. The steps below use its 'Disease' column and every
# column whose name matches the regex 'Symptom'.
dataset_df = pd.read_csv('data/dataset.csv')

# Preprocess
# Trim surrounding whitespace from every cell so that values differing only
# by stray spaces end up as the same category.
# NOTE(review): `.str` raises AttributeError on a non-string column, so this
# assumes every column in the CSV is textual - confirm against the data.
dataset_df = dataset_df.apply(lambda col: col.str.strip())

# One-hot encode the symptom columns. With an empty prefix and separator each
# indicator column is named after the symptom value itself, so a symptom that
# occurs in several source columns yields several identically named columns.
test = pd.get_dummies(dataset_df.filter(regex='Symptom'), prefix='', prefix_sep='')

# Collapse identically named columns into a single indicator per symptom
# (max acts as a logical OR). Grouping the transposed frame by its index
# replaces `groupby(..., axis=1)`, which is deprecated since pandas 2.1, and
# avoids passing the `np.max` callable to `agg`.
test = test.T.groupby(level=0).max().T

# Re-attach the label column by row index, then persist the cleaned table.
clean_df = pd.merge(test, dataset_df['Disease'], left_index=True, right_index=True)
clean_df.to_csv('data/clean_dataset.tsv', sep='\t', index=False)
# Preprocessing
# Features are every column except the last; the last column is the label
# ('Disease' is merged in as the right-most column when clean_df is built).
X_data = clean_df.iloc[:, :-1]
y_data = clean_df.iloc[:, -1]

# Map the label values to integer codes. LabelEncoder handles the raw label
# values directly, so no intermediate 'category' cast is needed.
le = preprocessing.LabelEncoder()
le.fit(y_data)

# Hold out 20% of the rows for evaluation. The seed is fixed so that the
# split, and therefore the reported accuracy and the exported model, are
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=42
)

# Convert labels to numbers
y_train = le.transform(y_train)
y_test = le.transform(y_test)
# Init classifier (library default hyper-parameters)
model = xgb.XGBClassifier()

# Fit on the training split
model.fit(X_train, y_train)

# Predict integer-encoded labels for the held-out split
preds = model.predict(X_test)

# Test accuracy
print(f"The accuracy of the model is {accuracy_score(y_test, preds)}")

# Export model
# Create the output directory first: save_model does not create missing
# parent directories, so a fresh checkout without 'model/' would fail here.
# NOTE(review): the fitted LabelEncoder is not saved alongside the model;
# whatever loads this file needs the same label ordering to decode
# predictions - confirm how the consumer reconstructs it.
os.makedirs("model", exist_ok=True)
model.save_model("model/xgboost_model.json")
|