"""Train an XGBoost classifier that predicts ``Disease`` from symptom columns.

Pipeline, as executed top to bottom:

1. Read ``data/dataset.csv`` and strip surrounding whitespace from every cell.
2. One-hot encode all columns whose name matches ``Symptom`` and collapse
   duplicate indicator columns into a single column per distinct value.
3. Write the cleaned table (indicators + ``Disease``) to
   ``data/clean_dataset.tsv``.
4. Label-encode the target, split 80/20, fit ``xgb.XGBClassifier`` and print
   the accuracy on the held-out part.
5. Save the fitted model to ``model/xgboost_model.json``.
"""
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb  # Machine learning model: XGBoost
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split, and therefore the printed accuracy,
# is repeatable from one run to the next.
RANDOM_STATE = 42

# Import the dataset.
dataset_df = pd.read_csv('data/dataset.csv')

# Preprocess: trim whitespace so that otherwise-equal values do not end up
# as distinct categories. Uses the ``.str`` accessor on every column, so all
# columns are expected to hold text.
dataset_df = dataset_df.apply(lambda col: col.str.strip())

# One-hot encode the symptom columns. With an empty prefix the indicator
# columns are named after the raw cell values, so a value that occurs in
# several source columns yields several identically named columns.
test = pd.get_dummies(
    dataset_df.filter(regex='Symptom'), prefix='', prefix_sep=''
)

# Collapse identically named columns into one by taking the element-wise
# maximum (i.e. "present in any of the source columns"). Grouping on the
# transposed frame avoids the deprecated ``groupby(axis=1)`` / ``agg(np.max)``
# forms while producing the same table.
test = test.T.groupby(level=0).max().T

# Re-attach the target; ``Disease`` becomes the last column.
clean_df = pd.merge(
    test, dataset_df['Disease'], left_index=True, right_index=True
)
clean_df.to_csv('data/clean_dataset.tsv', sep='\t', index=False)

# Split features (all but the last column) from the target (last column).
X_data = clean_df.iloc[:, :-1]
y_data = clean_df.iloc[:, -1]

# Convert y to categorical values.
y_data = y_data.astype('category')

# Convert y categories to numbers with an encoder. The encoder is fitted on
# the full target so both splits share one label-to-integer mapping.
le = preprocessing.LabelEncoder()
le.fit(y_data)

X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
)

# Convert labels to numbers.
y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Init classifier.
model = xgb.XGBClassifier()

# Fit.
model.fit(X_train, y_train)

# Predict.
preds = model.predict(X_test)

# Test accuracy.
print(f"The accuracy of the model is {accuracy_score(y_test, preds)}")

# Export model. Create the target directory first so a fresh checkout
# without a ``model/`` folder can still save the result.
Path("model").mkdir(parents=True, exist_ok=True)
model.save_model("model/xgboost_model.json")