# kartikeyan2003's picture
# diseasepredctionBasedonWebapp
# f3b5806
import pandas as pd
import numpy as np
# def preprocess_kaggle(dataset_path):
# # import the dataset
# dataset_df = pd.read_csv(dataset_path)
# # Preprocess
# dataset_df = dataset_df.apply(lambda col: col.str.strip())
# test = pd.get_dummies(dataset_df.filter(regex='Symptom'), prefix='', prefix_sep='')
# test = test.groupby(test.columns, axis=1).agg(np.max)
# clean_df = pd.merge(test,dataset_df['Disease'], left_index=True, right_index=True)
# return clean_df
def prepare_symptoms_array(symptoms, *, dataset_path='data/clean_dataset.tsv',
                           n_features=133):
    """Build the one-hot feature row for a list of symptom names.

    Each symptom is looked up among the column names of the cleaned
    dataset, and the matching position of the output row is set to 1;
    every other position stays 0.

    Args:
        symptoms: Iterable of symptom names. Each one must be a column
            name in the file at ``dataset_path``.
        dataset_path: Tab-separated file whose header row defines the
            feature order. Defaults to the path this function has always
            read.
        n_features: Width of the returned row. Defaults to 133, the width
            this function has always produced (presumably the input size
            of the trained model -- TODO confirm against the model).

    Returns:
        A ``numpy`` array of shape ``(1, n_features)`` holding 1 at the
        column position of every given symptom and 0 elsewhere.

    Raises:
        KeyError: If a symptom is not a column of the dataset.
        IndexError: If a symptom's column position is not below
            ``n_features``.
    """
    # Only the header is needed for the name -> position mapping, so the
    # data rows are not parsed (nrows=0 still yields the column index).
    columns = pd.read_csv(dataset_path, sep='\t', nrows=0).columns

    symptoms_array = np.zeros((1, n_features))
    for symptom in symptoms:
        symptoms_array[0, columns.get_loc(symptom)] = 1
    return symptoms_array