| | |
"""symptomtest.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1veojMDDiTQ8mnugWDldDmlkSzrKADR1l
"""
| |
|
| | import pandas as pd |
| | import re |
| | import spacy |
| | from sklearn.model_selection import train_test_split |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| | from sklearn.pipeline import Pipeline |
| | from sklearn.metrics import accuracy_score, classification_report |
| | from sklearn.linear_model import LogisticRegression |
| |
|
| | |
# Load the raw symptom/disease records from the working directory. The
# parsing step later in this script reads the raw record text from a column
# named 'data'.
data = pd.read_csv('symptomssingle.csv')

# Discard every row that has a missing value in any column, so the text
# parsing that follows never receives NaN instead of a string.
data = data.dropna()
| |
|
| | |
def separate_symptoms_and_diseases(text):
    """Split one raw record into its symptom strings and its disease label.

    Every ``{"symptoms":"<value>"}`` fragment found in *text* contributes
    its ``<value>`` to the symptom list. Whatever text is left once those
    fragments (each with an optional trailing comma) are removed is treated
    as the disease label, after stripping surrounding whitespace and
    deleting any ``],`` sequence.

    Args:
        text: Raw record string.

    Returns:
        A ``(symptoms, disease)`` tuple. ``symptoms`` is the list of captured
        symptom strings in order of appearance (empty when nothing matches);
        ``disease`` is the cleaned remainder of *text*.
    """
    # Non-greedy capture so each fragment yields only its own value.
    symptoms = re.findall(r'{"symptoms":"(.*?)"}', text)

    # Remove runs of symptom fragments; only the label text should remain.
    remainder = re.sub(r'(?:{"symptoms":".*?"},?)+', '', text).strip()

    # Drop the leftover list-closing delimiter that precedes the label.
    disease = remainder.replace('],', '').strip()
    return symptoms, disease
| |
|
| | |
# Parse every raw record into a (symptoms, disease) pair.
data['symptoms_and_diseases'] = data['data'].apply(separate_symptoms_and_diseases)

# Expand the pairs into two real columns. Passing index=data.index keeps the
# new values aligned with their rows: the earlier dropna() can leave gaps in
# the index, and a default 0..n-1 index would misalign on assignment.
data[['symptoms', 'disease']] = pd.DataFrame(
    data['symptoms_and_diseases'].tolist(),
    index=data.index,
)

# The raw text and the intermediate tuple column are no longer needed.
data = data.drop(columns=['data', 'symptoms_and_diseases'])
| |
|
| | |
# Load spaCy's small English pipeline once at module level; preprocess()
# reuses this shared object for every symptom phrase. The model package must
# already be installed (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')
| |
|
| | |
def preprocess(symptoms):
    """Normalise symptom phrases into a single space-separated string.

    Each phrase is run through the module-level spaCy pipeline ``nlp``.
    Tokens that are stop words or are not purely alphabetic are dropped, and
    each remaining token is replaced by its lower-cased lemma.

    Args:
        symptoms: Iterable of symptom phrases. A bare string is treated as a
            single phrase rather than being iterated character by character.

    Returns:
        The normalised phrases joined by single spaces, or an empty string
        when *symptoms* is empty. A phrase with no surviving tokens
        contributes an empty segment to the join.
    """
    if isinstance(symptoms, str):
        # Guard against passing "low back pain" instead of ["low back pain"],
        # which would otherwise feed the pipeline one character at a time.
        symptoms = [symptoms]

    processed_symptoms = [
        ' '.join(
            token.lemma_.lower()
            for token in nlp(symptom)
            if not token.is_stop and token.is_alpha
        )
        for symptom in symptoms
    ]
    return ' '.join(processed_symptoms)
| |
|
| | |
# Collapse each row's list of symptom phrases into one normalised text
# string; this column is the only feature passed to the model.
data['symptoms_preprocessed'] = data['symptoms'].apply(preprocess)

# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['symptoms_preprocessed'],
    data['disease'],
    test_size=0.2,
    random_state=42,
)
| |
|
| | |
# TF-IDF features over unigrams and bigrams, fed to a logistic-regression
# classifier. Wrapping both steps in one Pipeline means the vectoriser's
# vocabulary travels with the classifier when the model is saved.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('classifier', LogisticRegression(solver='liblinear', C=10)),
])

# Fit the vectoriser and the classifier on the training split only.
pipeline.fit(X_train, y_train)
| |
|
| | |
# Predict labels for the held-out rows.
y_pred = pipeline.predict(X_test)

# Report overall accuracy followed by the per-class breakdown.
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
| |
|
# NOTE: the notebook ran `!pip install joblib` at this point. The leading
# `!` is IPython shell syntax and is a SyntaxError in a plain Python file,
# so the install step is kept only as this comment. Run `pip install joblib`
# from a shell if the package is missing.
import joblib

# Persist the fitted pipeline (vectoriser + classifier) to disk.
joblib.dump(pipeline, 'DiseasePredictionBasedonSymptoms.joblib')

# Reload the artifact to confirm it round-trips. joblib files are
# pickle-based, so only load files that come from a trusted source.
loaded_pipeline = joblib.load('DiseasePredictionBasedonSymptoms.joblib')
| |
|
| | |
# Smoke-test the reloaded model on one hand-written symptom phrase.
sample_symptom = "low back pain"

# preprocess() expects a collection of phrases, hence the one-element list.
# Inference input must go through the same normalisation as the training
# text.
processed_symptom = preprocess([sample_symptom])

# predict() takes a batch of documents and returns one label per document.
prediction = loaded_pipeline.predict([processed_symptom])

print("Predicted disease:", prediction[0])