Spaces:
Sleeping
Sleeping
Upload 13 files
Browse files- .gitattributes +1 -0
- Dockerfile +22 -0
- README.md +37 -10
- api_symptom_checker.py +142 -0
- app.py +8 -0
- evaluate_symptom_checker.py +103 -0
- fix_numpy_labels.py +86 -0
- main.py +224 -0
- preprocess_data.py +102 -0
- requirements.txt +8 -0
- symptom_checker.py +273 -0
- symptom_model.features.txt +297 -0
- symptom_model.json +3 -0
- symptom_model.labels.npy +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
symptom_model.json filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM python:3.9

# Work out of /code inside the container.
WORKDIR /code

# Copy requirements first so the dependency layer is cached across builds.
COPY ./requirements.txt /code/requirements.txt

# Install dependencies.
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy application code.
COPY . /code

# Hugging Face Spaces expects the app to listen on port 7860.
EXPOSE 7860

# Make the application importable from anywhere in the container.
ENV PYTHONPATH=/code

# Launch the API server.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
|
@@ -1,10 +1,37 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom: green
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: docker
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: GP-Tea Symptom Checker
|
| 3 |
+
emoji: 🩺
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# GP-Tea Symptom Checker Service
|
| 13 |
+
|
| 14 |
+
A FastAPI-based AI service for medical symptom analysis using XGBoost machine learning. This service analyzes user-selected symptoms and provides disease predictions with confidence scores.
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
|
| 18 |
+
- 🩺 **Symptom Analysis**: Select from 297 medical symptoms for analysis
|
| 19 |
+
- 🤖 **AI-Powered Predictions**: XGBoost model trained on medical data
|
| 20 |
+
- 📊 **Confidence Scoring**: Get top 3 disease predictions with confidence percentages
|
| 21 |
+
- 🚀 **Fast API**: RESTful API with automatic documentation
|
| 22 |
+
- 🌐 **CORS Enabled**: Ready for web application integration
|
| 23 |
+
- 📋 **Health Monitoring**: Built-in health check endpoint
|
| 24 |
+
|
| 25 |
+
## Local Development
|
| 26 |
+
|
| 27 |
+
```bash
|
| 28 |
+
# Install dependencies
|
| 29 |
+
pip install -r requirements.txt
|
| 30 |
+
|
| 31 |
+
# Run the service
|
| 32 |
+
uvicorn main:app --host 0.0.0.0 --port 7860 --reload
|
| 33 |
+
```
|
## Alternative: local development on port 8002

```bash
# Make sure you're in the Text_classification directory
uvicorn main:app --host 0.0.0.0 --port 8002 --reload
```
api_symptom_checker.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from typing import List, Dict, Any
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import xgboost as xgb
|
| 8 |
+
from sklearn.preprocessing import LabelEncoder
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def load_artifacts(prefix: str):
    """Load the saved XGBoost model, label-encoder classes and feature list.

    Expects three sibling files: ``<prefix>.json`` (XGBoost model),
    ``<prefix>.labels.npy`` (encoder classes array) and
    ``<prefix>.features.txt`` (one feature name per line).

    Raises:
        FileNotFoundError: if any of the three artifact files is missing.
    """
    model_path = f"{prefix}.json"
    labels_path = f"{prefix}.labels.npy"
    features_path = f"{prefix}.features.txt"

    missing = [p for p in (model_path, labels_path, features_path) if not os.path.exists(p)]
    if missing:
        raise FileNotFoundError(f"Missing artifacts. Expected: {model_path}, {labels_path}, {features_path}")

    model = xgb.XGBClassifier()
    model.load_model(model_path)

    # Rebuild the encoder by assigning the persisted classes directly.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.load(labels_path, allow_pickle=True)

    with open(features_path, "r", encoding="utf-8") as fh:
        feature_names = [ln.strip() for ln in fh if ln.strip()]

    return model, label_encoder, feature_names
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def build_feature_vector(symptom_names: List[str], selected_symptoms: List[str]) -> np.ndarray:
    """Encode a list of selected symptoms as a one-hot row vector.

    Matching is case- and whitespace-insensitive; symptoms that do not
    appear in *symptom_names* are silently ignored.

    Returns:
        A (1, len(symptom_names)) float array of 0.0/1.0 flags.
    """
    index_of = {name.strip().lower(): pos for pos, name in enumerate(symptom_names)}
    row = np.zeros(len(symptom_names), dtype=float)
    for chosen in selected_symptoms:
        pos = index_of.get(chosen.strip().lower())
        if pos is not None:
            row[pos] = 1.0
    return row.reshape(1, -1)
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def predict_symptoms_json(symptoms: List[str], model, label_encoder, feature_names: List[str]) -> Dict[str, Any]:
    """Return the top-3 disease predictions as a JSON-serializable dict.

    Args:
        symptoms: user-selected symptom names (matched case-insensitively).
        model: fitted classifier exposing ``predict_proba``.
        label_encoder: encoder whose ``inverse_transform`` maps a class
            index back to a disease name.
        feature_names: ordered feature (symptom) vocabulary of the model.
    """
    if not symptoms:
        return {"error": "No symptoms provided"}

    vector = build_feature_vector(feature_names, symptoms)
    probabilities = model.predict_proba(vector)[0]
    # Indices of the three largest probabilities, best first.
    ranked = np.argsort(probabilities)[-3:][::-1]

    top = []
    for position, class_idx in enumerate(ranked, start=1):
        score = float(probabilities[class_idx])
        top.append({
            "rank": position,
            "disease": label_encoder.inverse_transform([class_idx])[0],
            "confidence": score,
            "confidence_percent": round(score * 100, 2),
        })

    best = top[0]["confidence"]
    if best > 0.7:
        band = "high"
    elif best > 0.4:
        band = "medium"
    else:
        band = "low"

    return {
        "input_symptoms": symptoms,
        "primary_diagnosis": top[0],
        "top_predictions": top,
        "model_confidence": band,
    }
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def predict_symptoms_csv(symptoms: List[str], model, label_encoder, feature_names: List[str]) -> str:
    """Return the top-3 predictions as CSV text with a header row."""
    if not symptoms:
        return "error,No symptoms provided"

    vector = build_feature_vector(feature_names, symptoms)
    probabilities = model.predict_proba(vector)[0]
    ranked = np.argsort(probabilities)[-3:][::-1]

    rows = ["rank,disease,confidence,confidence_percent"]
    rows.extend(
        f"{pos},{label_encoder.inverse_transform([class_idx])[0]},"
        f"{probabilities[class_idx]:.4f},{probabilities[class_idx]*100:.2f}"
        for pos, class_idx in enumerate(ranked, start=1)
    )
    return "\n".join(rows)
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def predict_symptoms_simple(symptoms: List[str], model, label_encoder, feature_names: List[str]) -> str:
    """Return a one-line human-readable diagnosis for the best prediction."""
    if not symptoms:
        return "Error: No symptoms provided"

    vector = build_feature_vector(feature_names, symptoms)
    probabilities = model.predict_proba(vector)[0]
    best_idx = int(np.argmax(probabilities))
    best_name = label_encoder.inverse_transform([best_idx])[0]
    return f"Diagnosis: {best_name} (Confidence: {probabilities[best_idx]*100:.1f}%)"
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def main():
    """CLI entry point: load artifacts, predict, and print in the chosen format."""
    parser = argparse.ArgumentParser(description="API-style symptom checker using saved model")
    parser.add_argument("--symptoms", nargs="+", required=True, help="List of symptoms")
    parser.add_argument("--format", choices=["json", "csv", "simple"], default="json", help="Output format")
    parser.add_argument("--artifacts-prefix", default="symptom_checker/symptom_model", help="Path to model artifacts")
    args = parser.parse_args()

    # Dispatch table: output format -> formatter function.
    formatters = {
        "json": predict_symptoms_json,
        "csv": predict_symptoms_csv,
        "simple": predict_symptoms_simple,
    }

    try:
        model, label_encoder, feature_names = load_artifacts(args.artifacts_prefix)
        result = formatters[args.format](args.symptoms, model, label_encoder, feature_names)
        if args.format == "json":
            print(json.dumps(result, indent=2))
        else:
            print(result)
    except Exception as e:
        # Report the failure in the same format the caller requested.
        if args.format == "json":
            print(json.dumps({"error": str(e), "input_symptoms": args.symptoms}, indent=2))
        else:
            print(f"Error: {e}")


if __name__ == "__main__":
    main()
app.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

import uvicorn

from main import app

if __name__ == "__main__":
    # Hugging Face Spaces expects the server to listen on port 7860.
    serve_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=serve_port)
evaluate_symptom_checker.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
from typing import Tuple
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import xgboost as xgb
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
from sklearn.preprocessing import LabelEncoder
|
| 10 |
+
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_data(csv_path: str) -> pd.DataFrame:
    """Read the dataset CSV and sanity-check its shape.

    Raises:
        FileNotFoundError: if *csv_path* does not exist.
        ValueError: if the CSV has fewer than two columns
            (target column plus at least one feature).
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV not found: {csv_path}")
    frame = pd.read_csv(csv_path)
    if frame.shape[1] < 2:
        raise ValueError("CSV must have at least 2 columns (target + features)")
    return frame
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def split_encode(df: pd.DataFrame, test_size: float, seed: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, LabelEncoder, list]:
    """Stratified train/test split with integer-encoded targets.

    The first column is treated as the target; all remaining columns are
    features. Returns (X_train, X_test, y_train, y_test, encoder,
    feature_names).
    """
    target_col = df.columns[0]
    features = df.iloc[:, 1:]
    labels = df[target_col]

    X_tr, X_te, y_tr, y_te = train_test_split(
        features, labels, test_size=test_size, random_state=seed, stratify=labels
    )

    encoder = LabelEncoder()
    y_tr_ids = encoder.fit_transform(y_tr)
    # transform (not fit_transform): stratification guarantees every class
    # seen in test was seen in train, so this cannot raise on unseen labels.
    y_te_ids = encoder.transform(y_te)

    return X_tr.values, X_te.values, y_tr_ids, y_te_ids, encoder, features.columns.tolist()
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def build_model(num_classes: int):
    """Construct an XGBoost classifier, preferring GPU when available.

    Tries the modern ``device="cuda"`` API first, then the legacy
    ``tree_method="gpu_hist"`` API, and finally falls back to CPU ``hist``.
    """
    base = dict(
        objective="multi:softprob",
        num_class=num_classes,
        eval_metric="mlogloss",
        tree_method="hist",
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
    try:
        return xgb.XGBClassifier(device="cuda", **base)
    except TypeError:
        # Older xgboost: no "device" kwarg; try the deprecated GPU tree method.
        legacy = {k: v for k, v in base.items() if k != "tree_method"}
        try:
            return xgb.XGBClassifier(tree_method="gpu_hist", **legacy)
        except Exception:
            # No GPU support at all: plain CPU histogram method.
            return xgb.XGBClassifier(**base)
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def main():
    """Train on a split of the CSV and print accuracy diagnostics."""
    parser = argparse.ArgumentParser(description="Evaluate XGBoost Symptom Checker accuracy")
    parser.add_argument("--csv", required=True, help="Path to cleaned CSV (target + binary features)")
    parser.add_argument("--test-size", type=float, default=0.2, help="Test set fraction (default 0.2)")
    parser.add_argument("--seed", type=int, default=42, help="Random seed (default 42)")
    args = parser.parse_args()

    print("Loading data...")
    df = load_data(args.csv)
    print(f"Shape: {df.shape}")

    print("Splitting and encoding labels...")
    X_train, X_test, y_train, y_test, label_enc, feature_names = split_encode(df, args.test_size, args.seed)
    num_classes = len(np.unique(y_train))
    print(f"Classes: {num_classes}; Features: {len(feature_names)}")

    print("Training model...")
    model = build_model(num_classes)
    try:
        # Older xgboost accepts early stopping directly in fit().
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=50, early_stopping_rounds=30)
    except TypeError:
        # Newer xgboost removed the kwarg from fit(); train without it.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=50)

    print("Evaluating...")
    y_pred = np.argmax(model.predict_proba(X_test), axis=1)

    acc = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy: {acc:.4f} ({acc*100:.2f}%)")

    print("\nClassification report:")
    target_names = label_enc.inverse_transform(np.arange(num_classes))
    print(classification_report(y_test, y_pred, target_names=target_names, zero_division=0))

    print("Confusion matrix (rows=true, cols=pred):")
    print(confusion_matrix(y_test, y_pred))


if __name__ == "__main__":
    main()
fix_numpy_labels.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Fix NumPy compatibility issues with symptom_model.labels.npy
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
def fix_labels_file():
    """Regenerate symptom_model.labels.npy with the current NumPy version.

    The original .npy was written by an incompatible NumPy, so a labels
    array is rebuilt from scratch: first a curated list of common diseases
    (method 1), and if that does not succeed for any reason, a minimal
    placeholder list (method 2).

    Returns:
        True when a labels file was written, False otherwise.
    """
    json_file = "symptom_model.json"
    labels_file = "symptom_model.labels.npy"
    features_file = "symptom_model.features.txt"

    if not os.path.exists(json_file):
        print(f"❌ {json_file} not found!")
        return False

    # Method 1: inspect the model JSON and write a curated disease list.
    try:
        print("🔍 Checking model JSON file for label information...")
        with open(json_file, 'r') as f:
            model_data = json.load(f)

        # XGBoost models sometimes store class names in the JSON
        if 'learner' in model_data and 'objective' in model_data['learner']:
            print("📋 Found XGBoost model structure")

        if os.path.exists(features_file):
            with open(features_file, 'r', encoding='utf-8') as f:
                features = [line.strip() for line in f if line.strip()]
            print(f"📝 Found {len(features)} features")

            # Create a comprehensive list of common diseases for symptom prediction
            common_diseases = [
                "Common Cold", "Flu", "Headache", "Migraine", "Fever",
                "Cough", "Sore Throat", "Bronchitis", "Pneumonia", "Asthma",
                "Allergies", "Sinusitis", "Gastritis", "Indigestion", "Nausea",
                "Diarrhea", "Constipation", "UTI", "Kidney Stones", "Hypertension",
                "Diabetes", "Arthritis", "Back Pain", "Muscle Strain", "Anxiety",
                "Depression", "Insomnia", "Fatigue", "Dizziness", "Anemia",
                "Dehydration", "Food Poisoning", "Viral Infection", "Bacterial Infection",
                "Skin Rash", "Eczema", "Acne", "Sunburn", "Cuts and Bruises"
            ]

            labels_array = np.array(common_diseases, dtype=object)
            np.save(labels_file, labels_array, allow_pickle=True)

            print(f"✅ Successfully created {labels_file} with {len(common_diseases)} diseases")
            return True
    except Exception as e:
        print(f"❌ Method 1 failed: {e}")

    # Method 2: minimal working labels file.
    # BUG FIX: previously method 2 only ran when method 1 raised an
    # exception; a missing features file made the function fall through
    # and return None without writing anything.
    try:
        print("🔧 Creating minimal labels file...")
        minimal_labels = [
            "Unknown Condition", "Common Cold", "Flu", "Headache", "Fever",
            "Cough", "Fatigue", "Nausea", "Pain", "Infection"
        ]

        labels_array = np.array(minimal_labels, dtype=object)
        np.save(labels_file, labels_array, allow_pickle=True)

        print(f"✅ Created minimal {labels_file} with {len(minimal_labels)} conditions")
        return True

    except Exception as e:
        print(f"❌ Method 2 failed: {e}")
        return False
| 77 |
+
|
| 78 |
+
if __name__ == "__main__":
    print("🔧 Fixing NumPy compatibility for symptom_model.labels.npy...")

    repaired = fix_labels_file()
    if repaired:
        print("\n🎉 Labels file fixed successfully!")
        print("You can now restart the FastAPI server.")
    else:
        print("\n❌ Failed to fix labels file.")
        print("You may need to retrain the model or get the original training data.")
main.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException
|
| 2 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from typing import List
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
# Import the existing symptom checker logic
|
| 10 |
+
from api_symptom_checker import load_artifacts, predict_symptoms_json
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
def safe_predict_symptoms_json(symptoms, model, label_encoder, feature_names):
    """Predict diseases from display-form symptoms, guarding against a
    model/label-encoder class-count mismatch.

    Args:
        symptoms: display-form symptom names (e.g. "Anxiety And Nervousness").
        model: classifier exposing ``predict_proba``.
        label_encoder: encoder whose ``classes_`` may list fewer classes than
            the model outputs; predictions are clamped to the known classes.
        feature_names: underscore-format feature vocabulary, in model order.

    Returns:
        A dict with matched symptoms, the primary diagnosis, the top-3
        predictions and a coarse confidence band — or an ``error`` dict.
    """
    if not symptoms:
        return {"error": "No symptoms provided"}

    # Map display names ("Chest Pain") back to feature names ("chest_pain"),
    # and feature names to their column index. Both maps are built once so
    # matching is O(1) per symptom (previously feature_names.index() made
    # this O(n) per symptom).
    display_to_feature = {name.replace("_", " ").title(): name for name in feature_names}
    feature_index = {name: i for i, name in enumerate(feature_names)}

    x = np.zeros(len(feature_names))
    matched_symptoms = []
    for symptom in symptoms:
        feature_name = display_to_feature.get(symptom)
        if feature_name is not None:
            x[feature_index[feature_name]] = 1.0
            matched_symptoms.append(symptom)

    if not matched_symptoms:
        return {"error": "No valid symptoms found"}

    proba = model.predict_proba(x.reshape(1, -1))[0]

    # SAFETY: the model may emit more classes than the encoder knows about;
    # only consider probabilities for classes the encoder can decode.
    valid_proba = proba[:len(label_encoder.classes_)]
    top3_idx = np.argsort(valid_proba)[-3:][::-1]

    predictions = []
    for rank, idx in enumerate(top3_idx, 1):
        confidence = float(valid_proba[idx])
        predictions.append({
            "rank": rank,
            "disease": label_encoder.inverse_transform([idx])[0],
            "confidence": confidence,
            "confidence_percent": round(confidence * 100, 2),
        })

    best = predictions[0]["confidence"]
    return {
        "input_symptoms": matched_symptoms,
        "primary_diagnosis": predictions[0],
        "top_predictions": predictions,
        "model_confidence": "high" if best > 0.7 else "medium" if best > 0.4 else "low",
    }
| 63 |
+
|
| 64 |
+
# --- Logging ------------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- FastAPI application ------------------------------------------------------
app = FastAPI(
    title="Symptom Checker API",
    description="AI-powered symptom analysis service",
    version="1.0.0"
)

# Allow browser clients from any origin to call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure this properly for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Model artifacts, populated once by the startup handler.
model = None
label_encoder = None
feature_names = None

# --- Request / response schemas -----------------------------------------------

class SymptomRequest(BaseModel):
    # Symptoms selected by the user, in display form.
    symptoms: List[str]


class PredictionItem(BaseModel):
    # One ranked disease prediction.
    rank: int
    disease: str
    confidence: float
    confidence_percent: float


class SymptomResponse(BaseModel):
    # Full prediction payload returned by the checker.
    input_symptoms: List[str]
    primary_diagnosis: PredictionItem
    top_predictions: List[PredictionItem]
    model_confidence: str


class AvailableSymptomsResponse(BaseModel):
    # Catalogue of symptoms the model understands.
    success: bool = True
    symptoms: List[str]
    total_symptoms: int
+
@app.on_event("startup")
async def startup_event():
    """Load model artifacts once when the server boots."""
    global model, label_encoder, feature_names
    try:
        logger.info("Loading symptom checker model artifacts...")
        model, label_encoder, feature_names = load_artifacts("symptom_model")
        logger.info(f"Model loaded successfully with {len(feature_names)} features")
    except Exception as e:
        # A server without a model is useless — fail the startup loudly.
        logger.error(f"Failed to load model artifacts: {e}")
        raise
| 122 |
+
@app.get("/")
async def root():
    """Describe the service and its available endpoints."""
    available = ["/health", "/api/symptoms", "/api/check-symptoms"]
    return {
        "message": "Symptom Checker API",
        "version": "1.0.0",
        "endpoints": available,
    }
| 131 |
+
@app.get("/health")
async def health_check():
    """Report service liveness and whether the model is loaded."""
    if model is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    feature_count = len(feature_names) if feature_names else 0
    return {
        "status": "healthy",
        "service": "symptom-checker",
        "model_loaded": model is not None,
        "features_count": feature_count,
    }
| 144 |
+
@app.get("/api/symptoms", response_model=AvailableSymptomsResponse)
async def get_available_symptoms():
    """List every symptom name the model recognizes, in display form."""
    if feature_names is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # underscore_format -> "Title Case" for the UI.
    display_names = [name.replace('_', ' ').title() for name in feature_names]

    return AvailableSymptomsResponse(
        success=True,
        symptoms=sorted(display_names),
        total_symptoms=len(display_names),
    )
| 163 |
+
@app.post("/api/check-symptoms")
async def check_symptoms(request: SymptomRequest):
    """Analyze symptoms and return disease predictions.

    The response shape matches the Flutter client's SymptomCheckResponse:
    a success flag, ranked predictions, the echoed input and a coarse
    confidence band.

    Raises:
        HTTPException 503: model artifacts are not loaded.
        HTTPException 400: no symptoms supplied, or none recognized.
        HTTPException 500: unexpected prediction failure.
    """
    if model is None or label_encoder is None or feature_names is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    if not request.symptoms:
        raise HTTPException(status_code=400, detail="No symptoms provided")

    try:
        # safe_predict_symptoms_json handles the display-name -> feature-name
        # mapping and the model/label-encoder class-count mismatch itself.
        # (A redundant pre-conversion loop and an unused PredictionItem list
        # were removed here — both were dead code.)
        result = safe_predict_symptoms_json(request.symptoms, model, label_encoder, feature_names)

        if "error" in result:
            raise HTTPException(status_code=400, detail=result["error"])

        # Return format that matches Flutter's SymptomCheckResponse expectations
        return {
            "success": True,
            "predictions": [
                {
                    "rank": pred["rank"],
                    "disease": pred["disease"],
                    "confidence": pred["confidence"],
                    "confidence_percent": f"{pred['confidence_percent']:.2f}%"
                }
                for pred in result["top_predictions"]
            ],
            "input_symptoms": request.symptoms,
            "primary_diagnosis": result["primary_diagnosis"]["disease"],
            "model_confidence": result["model_confidence"]
        }

    except HTTPException:
        # BUG FIX: previously the broad handler below caught the 400 raised
        # for invalid symptoms and re-emitted it as a 500.
        raise
    except Exception as e:
        logger.error(f"Error during symptom prediction: {e}")
        raise HTTPException(status_code=500, detail=f"Prediction failed: {str(e)}")
| 219 |
+
if __name__ == "__main__":
    import os
    import uvicorn

    # Port 7860 for Hugging Face Spaces; override with PORT for local runs.
    serve_port = int(os.getenv("PORT", 7860))
    uvicorn.run("main:app", host="0.0.0.0", port=serve_port, reload=False)
preprocess_data.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy whose column names are stripped, lowercased, and have
    spaces replaced with underscores."""
    out = df.copy()
    out.columns = [col.strip().lower().replace(" ", "_") for col in out.columns]
    return out
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def drop_invalid_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows with a missing target (first column) and rows whose feature
    columns are entirely empty (all NaN, or summing to zero)."""
    out = df.copy()
    target = out.columns[0]
    out = out[out[target].notna()]
    feats = out.iloc[:, 1:]
    all_missing = feats.isna().all(axis=1)
    zero_total = feats.sum(axis=1) == 0
    return out.loc[~(all_missing | zero_total)]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def remove_constant_and_sparse_features(df: pd.DataFrame, min_positive_frac: float = 0.0005):
    """Drop feature columns that are constant or near-zero variance.

    A column is removed when it has at most one distinct value, or when its
    fraction of positive entries (NaN treated as 0) is below
    *min_positive_frac*. The target (first column) is always kept.
    """
    target_col = df.columns[0]
    features = df.iloc[:, 1:]
    selected = []
    for name in features.columns:
        col = features[name]
        # Constant columns carry no signal.
        if col.nunique(dropna=True) <= 1:
            continue
        try:
            positive_ratio = (col.fillna(0) > 0).mean()
        except Exception:
            # Non-comparable column: keep it, sparsity cannot be assessed.
            positive_ratio = 1.0
        if positive_ratio < min_positive_frac:
            continue
        selected.append(name)
    return pd.concat([df[[target_col]], features[selected]], axis=1)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def impute_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing feature values with 0; the target column is left as is."""
    target_col = df.columns[0]
    filled_features = df.iloc[:, 1:].fillna(0)
    return pd.concat([df[[target_col]], filled_features], axis=1)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def limit_classes(df: pd.DataFrame, min_samples: int = 5) -> pd.DataFrame:
    """Keep only rows whose target class occurs at least *min_samples* times."""
    target_col = df.columns[0]
    class_sizes = df[target_col].value_counts()
    frequent = class_sizes[class_sizes >= min_samples].index
    return df[df[target_col].isin(frequent)]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def main():
    """CLI entry point: load a raw disease-symptom CSV, clean it, and save it.

    Pipeline: standardize column names -> drop invalid/empty rows -> remove
    constant and near-zero-variance features -> impute missing values with 0
    -> drop classes with fewer than 5 samples. Exits with status 1 when the
    input file does not exist.
    """
    parser = argparse.ArgumentParser(description="Preprocess disease-symptom CSV for training.")
    parser.add_argument("--input", required=True, help="Path to raw CSV")
    parser.add_argument("--output", default="cleaned_dataset.csv", help="Path to save cleaned CSV")
    args = parser.parse_args()

    if not os.path.exists(args.input):
        print(f"❌ Input CSV not found: {args.input}")
        sys.exit(1)

    print("Loading CSV...")
    df = pd.read_csv(args.input)
    print(f"Raw shape: {df.shape}")

    print("Standardizing column names...")
    df = standardize_columns(df)

    print("Dropping invalid/empty rows...")
    df = drop_invalid_rows(df)
    print(f"After row cleanup: {df.shape}")

    print("Removing constant and sparse features...")
    df = remove_constant_and_sparse_features(df)
    print(f"After feature cleanup: {df.shape}")

    print("Imputing missing values (0 for symptoms)...")
    df = impute_missing(df)

    print("Limiting classes with very few samples...")
    df = limit_classes(df, min_samples=5)
    print(f"After class filtering: {df.shape}")

    print(f"Saving cleaned CSV to: {args.output}")
    df.to_csv(args.output, index=False)
    print("Done.")
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Run the preprocessing pipeline only when executed as a script.
if __name__ == "__main__":
    main()
|
| 101 |
+
|
| 102 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.104.1
|
| 2 |
+
uvicorn[standard]==0.24.0
|
| 3 |
+
pydantic==2.5.0
|
| 4 |
+
python-multipart==0.0.6
|
| 5 |
+
xgboost==2.0.3
|
| 6 |
+
pandas==2.2.0
|
| 7 |
+
numpy==1.26.0
|
| 8 |
+
scikit-learn==1.4.0
|
symptom_checker.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import xgboost as xgb
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
from sklearn.preprocessing import LabelEncoder
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def load_dataset(csv_path: str) -> pd.DataFrame:
    """Read the training CSV and validate its basic shape.

    The file must have the target (disease) as its first column followed by
    at least one symptom feature column.

    Raises:
        FileNotFoundError: if *csv_path* does not exist.
        ValueError: if the CSV has fewer than two columns.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(
            f"CSV not found at '{csv_path}'. Provide a valid path with --csv <path>."
        )
    frame = pd.read_csv(csv_path)
    if frame.shape[1] < 2:
        raise ValueError("Dataset must have at least 2 columns: target then feature columns.")
    return frame
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def train_model(data: pd.DataFrame):
    """Train an XGBoost multi-class classifier on a disease-symptom table.

    The first column of *data* is the disease label; the remaining columns
    are symptom features. Returns ``(model, label_encoder, feature_names)``.
    """
    y = data.iloc[:, 0]

    # Remove diseases with only 1 record
    # (a stratified train/test split needs at least 2 samples per class).
    value_counts = y.value_counts()
    rare_diseases = value_counts[value_counts < 2].index
    data_filtered = data[~data.iloc[:, 0].isin(rare_diseases)]

    X = data_filtered.iloc[:, 1:]
    y = data_filtered.iloc[:, 0]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # Prefer GPU if available, but fall back to CPU if not supported
    common_kwargs = dict(
        objective="multi:softprob",
        num_class=len(np.unique(y_train_encoded)),
        eval_metric="mlogloss",
        tree_method="hist",
        n_estimators=400,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )

    try:
        # Newer xgboost accepts a 'device' keyword for GPU selection.
        model = xgb.XGBClassifier(device="cuda", **common_kwargs)
    except TypeError:
        # Older xgboost: no 'device' param. Try GPU via tree_method if supported, else CPU.
        try:
            model = xgb.XGBClassifier(tree_method="gpu_hist", **{k: v for k, v in common_kwargs.items() if k != "tree_method"})
        except Exception:
            model = xgb.XGBClassifier(**common_kwargs)

    try:
        model.fit(
            X_train,
            y_train_encoded,
            eval_set=[(X_test, y_test_encoded)],
            verbose=50,
            early_stopping_rounds=50,
        )
    except TypeError:
        # Older xgboost versions do not support early_stopping_rounds in sklearn API
        # (the fallback trains the full n_estimators without early stopping).
        model.fit(
            X_train,
            y_train_encoded,
            eval_set=[(X_test, y_test_encoded)],
            verbose=50,
        )

    return model, label_encoder, X.columns.tolist()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def save_artifacts(model: xgb.XGBClassifier, label_encoder: LabelEncoder, feature_names: List[str], prefix: str) -> Tuple[str, str, str]:
    """Persist the trained model and its companion data under *prefix*.

    Writes three files: ``<prefix>.json`` (the model), ``<prefix>.labels.npy``
    (the encoder's class array), and ``<prefix>.features.txt`` (one feature
    name per line). Returns the three paths in that order.
    """
    os.makedirs(os.path.dirname(prefix) or ".", exist_ok=True)
    model_path = f"{prefix}.json"
    labels_path = f"{prefix}.labels.npy"
    features_path = f"{prefix}.features.txt"

    try:
        model.save_model(model_path)
    except Exception:
        # Some wrapper versions only support saving through the raw booster.
        model.get_booster().save_model(model_path)

    # Classes are strings, so pickling must be enabled for np.save.
    np.save(labels_path, label_encoder.classes_, allow_pickle=True)

    with open(features_path, "w", encoding="utf-8") as handle:
        for feature in feature_names:
            handle.write(f"{feature}\n")

    return model_path, labels_path, features_path
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def load_artifacts(prefix: str) -> Tuple[xgb.XGBClassifier, LabelEncoder, List[str]]:
    """Load the model, label encoder, and feature names saved by save_artifacts.

    Raises:
        FileNotFoundError: if any of the three expected files is missing.
    """
    model_path = f"{prefix}.json"
    labels_path = f"{prefix}.labels.npy"
    features_path = f"{prefix}.features.txt"

    all_present = (
        os.path.exists(model_path)
        and os.path.exists(labels_path)
        and os.path.exists(features_path)
    )
    if not all_present:
        raise FileNotFoundError(
            f"Missing artifacts. Expected: '{model_path}', '{labels_path}', '{features_path}'."
        )

    model = xgb.XGBClassifier()
    model.load_model(model_path)

    # Classes are strings, so allow_pickle is required to read them back.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.load(labels_path, allow_pickle=True)

    with open(features_path, "r", encoding="utf-8") as handle:
        feature_names = []
        for line in handle:
            name = line.strip()
            if name:
                feature_names.append(name)

    return model, label_encoder, feature_names
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def build_feature_vector(symptom_names: List[str], selected: List[str]) -> np.ndarray:
    """Encode user-selected symptoms as a binary row of shape (1, n_features).

    Matching against *symptom_names* is case-insensitive and ignores leading
    and trailing whitespace; unknown symptoms are silently skipped.
    """
    index_by_name = {}
    for position, name in enumerate(symptom_names):
        index_by_name[name.lower().strip()] = position

    row = np.zeros(len(symptom_names), dtype=float)
    for symptom in selected:
        position = index_by_name.get(symptom.lower().strip())
        if position is not None:
            row[position] = 1.0
    return row.reshape(1, -1)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def interactive_loop(model, label_encoder, symptom_names: List[str]):
    """Run a console REPL that predicts diseases from comma-separated symptoms.

    Special inputs: 'list' prints all feature names; 'quit'/'exit'/'q' or an
    empty line ends the loop. For other input, prints the primary diagnosis
    and the three most likely conditions with their probabilities.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("🩺 Symptom Checker (XGBoost)")
    print(banner)
    print("Enter symptoms separated by commas. Example: fever, cough, headache")
    print("Type 'list' to see all available symptoms, or 'quit' to exit.")
    print(banner)

    while True:
        try:
            raw = input("\n💬 Symptoms: ").strip()
            lowered = raw.lower()
            if lowered in {"quit", "exit", "q", ""}:
                print("👋 Goodbye!")
                break
            if lowered == "list":
                print("\nAvailable symptoms (features):")
                print(", ".join(symptom_names))
                continue

            chosen = [token for token in raw.split(",") if token.strip()]
            if not chosen:
                print("⚠️ Please enter at least one symptom.")
                continue

            vector = build_feature_vector(symptom_names, chosen)
            probabilities = model.predict_proba(vector)[0]
            best3 = np.argsort(probabilities)[-3:][::-1]
            best = best3[0]

            best_label = label_encoder.inverse_transform([best])[0]
            best_conf = probabilities[best]

            print("\n📊 Prediction Results")
            print("-" * 60)
            print(f"🏥 Primary Diagnosis: {best_label}")
            print(f"📈 Confidence: {best_conf:.4f} ({best_conf*100:.2f}%)")
            print("\n🏆 Top 3 Possible Conditions:")
            for rank, idx in enumerate(best3, start=1):
                label = label_encoder.inverse_transform([idx])[0]
                print(f"  {rank}. {label}: {probabilities[idx]:.4f} ({probabilities[idx]*100:.2f}%)")

        except KeyboardInterrupt:
            print("\n👋 Interrupted. Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error: {e}")
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def main():
    """CLI entry point for the symptom checker.

    Modes, checked in this order:
      --interactive-only : load saved artifacts and start the console REPL.
      --eval-only        : load artifacts, score them on --csv, then exit.
      default            : train on --csv, optionally save artifacts with
                           --save-prefix, then start the REPL.
    """
    parser = argparse.ArgumentParser(description="Symptom checker using an XGBoost classifier.")
    parser.add_argument(
        "--csv",
        type=str,
        required=False,
        help="Path to CSV dataset. First column must be target (disease), remaining columns symptoms.",
    )
    parser.add_argument(
        "--save-prefix",
        type=str,
        default=None,
        help="Prefix to save artifacts (creates .json/.labels.npy/.features.txt)",
    )
    parser.add_argument(
        "--eval-only",
        action="store_true",
        help="Evaluate previously saved artifacts on --csv and exit (no training).",
    )
    parser.add_argument(
        "--artifacts-prefix",
        type=str,
        default="symptom_checker/symptom_model",
        help="Prefix path to load artifacts (default: symptom_checker/symptom_model)",
    )
    parser.add_argument(
        "--interactive-only",
        action="store_true",
        help="Start interactive mode using saved artifacts only (no training).",
    )
    args = parser.parse_args()

    if args.interactive_only:
        try:
            model, label_encoder, feature_names = load_artifacts(args.artifacts_prefix)
        except FileNotFoundError as e:
            print(str(e))
            # NOTE(review): the example path below says 'symtom_checker.py' (typo) —
            # confirm against the actual script filename before relying on it.
            print("Train and save first, e.g.:\n python symptom_checker/symtom_checker.py --csv cleaned_dataset.csv --save-prefix symptom_checker/symptom_model")
            return
        interactive_loop(model, label_encoder, feature_names)
        return

    if args.eval_only:
        if not args.csv:
            print("Provide CSV for evaluation. Example:\n python symptom_checker/symtom_checker.py --eval-only --csv cleaned_dataset.csv --artifacts-prefix symptom_checker/symptom_model")
            return
        data = load_dataset(args.csv)
        try:
            model, label_encoder, feature_names = load_artifacts(args.artifacts_prefix)
        except FileNotFoundError as e:
            print(str(e))
            return
        target_col = data.columns[0]
        # Evaluation requires every training-time feature column to be present.
        missing = [c for c in feature_names if c not in data.columns]
        if missing:
            print(f"CSV missing {len(missing)} feature columns from training. Example missing: {missing[:10]}")
            return
        X = data.loc[:, feature_names].fillna(0).values
        y = data[target_col].values
        y_enc = label_encoder.transform(y)
        proba = model.predict_proba(X)
        y_pred = np.argmax(proba, axis=1)
        acc = (y_pred == y_enc).mean()
        print(f"Accuracy on provided CSV: {acc:.4f} ({acc*100:.2f}%)")
        return

    if not args.csv:
        print("❗ No CSV provided. Run: python symptom_checker/symtom_checker.py --csv path/to/dataset.csv")
        return

    data = load_dataset(args.csv)
    print("Shape of dataset:", data.shape)
    model, label_encoder, symptom_names = train_model(data)

    if args.save_prefix:
        print("Saving artifacts...")
        paths = save_artifacts(model, label_encoder, symptom_names, args.save_prefix)
        for p in paths:
            print(f" - {p}")

    interactive_loop(model, label_encoder, symptom_names)
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
# Train, evaluate, or start the REPL depending on CLI flags when run as a script.
if __name__ == "__main__":
    main()
|
| 273 |
+
|
symptom_model.features.txt
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
anxiety_and_nervousness
|
| 2 |
+
depression
|
| 3 |
+
shortness_of_breath
|
| 4 |
+
depressive_or_psychotic_symptoms
|
| 5 |
+
sharp_chest_pain
|
| 6 |
+
dizziness
|
| 7 |
+
insomnia
|
| 8 |
+
abnormal_involuntary_movements
|
| 9 |
+
chest_tightness
|
| 10 |
+
palpitations
|
| 11 |
+
irregular_heartbeat
|
| 12 |
+
breathing_fast
|
| 13 |
+
hoarse_voice
|
| 14 |
+
sore_throat
|
| 15 |
+
difficulty_speaking
|
| 16 |
+
cough
|
| 17 |
+
nasal_congestion
|
| 18 |
+
throat_swelling
|
| 19 |
+
diminished_hearing
|
| 20 |
+
lump_in_throat
|
| 21 |
+
throat_feels_tight
|
| 22 |
+
difficulty_in_swallowing
|
| 23 |
+
skin_swelling
|
| 24 |
+
retention_of_urine
|
| 25 |
+
groin_mass
|
| 26 |
+
leg_pain
|
| 27 |
+
hip_pain
|
| 28 |
+
suprapubic_pain
|
| 29 |
+
blood_in_stool
|
| 30 |
+
lack_of_growth
|
| 31 |
+
emotional_symptoms
|
| 32 |
+
elbow_weakness
|
| 33 |
+
back_weakness
|
| 34 |
+
symptoms_of_the_scrotum_and_testes
|
| 35 |
+
swelling_of_scrotum
|
| 36 |
+
pain_in_testicles
|
| 37 |
+
flatulence
|
| 38 |
+
pus_draining_from_ear
|
| 39 |
+
jaundice
|
| 40 |
+
mass_in_scrotum
|
| 41 |
+
white_discharge_from_eye
|
| 42 |
+
irritable_infant
|
| 43 |
+
abusing_alcohol
|
| 44 |
+
fainting
|
| 45 |
+
hostile_behavior
|
| 46 |
+
drug_abuse
|
| 47 |
+
sharp_abdominal_pain
|
| 48 |
+
feeling_ill
|
| 49 |
+
vomiting
|
| 50 |
+
headache
|
| 51 |
+
nausea
|
| 52 |
+
diarrhea
|
| 53 |
+
vaginal_itching
|
| 54 |
+
vaginal_dryness
|
| 55 |
+
painful_urination
|
| 56 |
+
involuntary_urination
|
| 57 |
+
pain_during_intercourse
|
| 58 |
+
frequent_urination
|
| 59 |
+
lower_abdominal_pain
|
| 60 |
+
vaginal_discharge
|
| 61 |
+
blood_in_urine
|
| 62 |
+
hot_flashes
|
| 63 |
+
intermenstrual_bleeding
|
| 64 |
+
hand_or_finger_pain
|
| 65 |
+
wrist_pain
|
| 66 |
+
hand_or_finger_swelling
|
| 67 |
+
arm_pain
|
| 68 |
+
wrist_swelling
|
| 69 |
+
arm_stiffness_or_tightness
|
| 70 |
+
arm_swelling
|
| 71 |
+
hand_or_finger_stiffness_or_tightness
|
| 72 |
+
wrist_stiffness_or_tightness
|
| 73 |
+
lip_swelling
|
| 74 |
+
toothache
|
| 75 |
+
abnormal_appearing_skin
|
| 76 |
+
skin_lesion
|
| 77 |
+
acne_or_pimples
|
| 78 |
+
dry_lips
|
| 79 |
+
facial_pain
|
| 80 |
+
mouth_ulcer
|
| 81 |
+
skin_growth
|
| 82 |
+
eye_deviation
|
| 83 |
+
diminished_vision
|
| 84 |
+
double_vision
|
| 85 |
+
cross-eyed
|
| 86 |
+
symptoms_of_eye
|
| 87 |
+
pain_in_eye
|
| 88 |
+
eye_moves_abnormally
|
| 89 |
+
abnormal_movement_of_eyelid
|
| 90 |
+
foreign_body_sensation_in_eye
|
| 91 |
+
irregular_appearing_scalp
|
| 92 |
+
swollen_lymph_nodes
|
| 93 |
+
back_pain
|
| 94 |
+
neck_pain
|
| 95 |
+
low_back_pain
|
| 96 |
+
pain_of_the_anus
|
| 97 |
+
pain_during_pregnancy
|
| 98 |
+
pelvic_pain
|
| 99 |
+
impotence
|
| 100 |
+
vomiting_blood
|
| 101 |
+
regurgitation
|
| 102 |
+
burning_abdominal_pain
|
| 103 |
+
restlessness
|
| 104 |
+
symptoms_of_infants
|
| 105 |
+
wheezing
|
| 106 |
+
peripheral_edema
|
| 107 |
+
neck_mass
|
| 108 |
+
ear_pain
|
| 109 |
+
jaw_swelling
|
| 110 |
+
mouth_dryness
|
| 111 |
+
neck_swelling
|
| 112 |
+
knee_pain
|
| 113 |
+
foot_or_toe_pain
|
| 114 |
+
ankle_pain
|
| 115 |
+
bones_are_painful
|
| 116 |
+
knee_weakness
|
| 117 |
+
elbow_pain
|
| 118 |
+
knee_swelling
|
| 119 |
+
skin_moles
|
| 120 |
+
knee_lump_or_mass
|
| 121 |
+
weight_gain
|
| 122 |
+
problems_with_movement
|
| 123 |
+
knee_stiffness_or_tightness
|
| 124 |
+
leg_swelling
|
| 125 |
+
foot_or_toe_swelling
|
| 126 |
+
heartburn
|
| 127 |
+
smoking_problems
|
| 128 |
+
muscle_pain
|
| 129 |
+
infant_feeding_problem
|
| 130 |
+
recent_weight_loss
|
| 131 |
+
difficulty_eating
|
| 132 |
+
vaginal_pain
|
| 133 |
+
vaginal_redness
|
| 134 |
+
vulvar_irritation
|
| 135 |
+
weakness
|
| 136 |
+
decreased_heart_rate
|
| 137 |
+
increased_heart_rate
|
| 138 |
+
bleeding_or_discharge_from_nipple
|
| 139 |
+
ringing_in_ear
|
| 140 |
+
plugged_feeling_in_ear
|
| 141 |
+
itchy_ear(s)
|
| 142 |
+
frontal_headache
|
| 143 |
+
fluid_in_ear
|
| 144 |
+
neck_stiffness_or_tightness
|
| 145 |
+
spots_or_clouds_in_vision
|
| 146 |
+
eye_redness
|
| 147 |
+
lacrimation
|
| 148 |
+
itchiness_of_eye
|
| 149 |
+
blindness
|
| 150 |
+
eye_burns_or_stings
|
| 151 |
+
itchy_eyelid
|
| 152 |
+
decreased_appetite
|
| 153 |
+
excessive_appetite
|
| 154 |
+
excessive_anger
|
| 155 |
+
loss_of_sensation
|
| 156 |
+
focal_weakness
|
| 157 |
+
slurring_words
|
| 158 |
+
symptoms_of_the_face
|
| 159 |
+
disturbance_of_memory
|
| 160 |
+
paresthesia
|
| 161 |
+
side_pain
|
| 162 |
+
fever
|
| 163 |
+
shoulder_pain
|
| 164 |
+
shoulder_stiffness_or_tightness
|
| 165 |
+
shoulder_weakness
|
| 166 |
+
shoulder_swelling
|
| 167 |
+
tongue_lesions
|
| 168 |
+
leg_cramps_or_spasms
|
| 169 |
+
ache_all_over
|
| 170 |
+
lower_body_pain
|
| 171 |
+
problems_during_pregnancy
|
| 172 |
+
spotting_or_bleeding_during_pregnancy
|
| 173 |
+
cramps_and_spasms
|
| 174 |
+
upper_abdominal_pain
|
| 175 |
+
stomach_bloating
|
| 176 |
+
changes_in_stool_appearance
|
| 177 |
+
unusual_color_or_odor_to_urine
|
| 178 |
+
kidney_mass
|
| 179 |
+
swollen_abdomen
|
| 180 |
+
symptoms_of_prostate
|
| 181 |
+
leg_stiffness_or_tightness
|
| 182 |
+
difficulty_breathing
|
| 183 |
+
rib_pain
|
| 184 |
+
joint_pain
|
| 185 |
+
muscle_stiffness_or_tightness
|
| 186 |
+
hand_or_finger_lump_or_mass
|
| 187 |
+
chills
|
| 188 |
+
groin_pain
|
| 189 |
+
fatigue
|
| 190 |
+
abdominal_distention
|
| 191 |
+
regurgitation.1
|
| 192 |
+
symptoms_of_the_kidneys
|
| 193 |
+
melena
|
| 194 |
+
coughing_up_sputum
|
| 195 |
+
seizures
|
| 196 |
+
delusions_or_hallucinations
|
| 197 |
+
pain_or_soreness_of_breast
|
| 198 |
+
excessive_urination_at_night
|
| 199 |
+
bleeding_from_eye
|
| 200 |
+
rectal_bleeding
|
| 201 |
+
constipation
|
| 202 |
+
temper_problems
|
| 203 |
+
coryza
|
| 204 |
+
hemoptysis
|
| 205 |
+
lymphedema
|
| 206 |
+
skin_on_leg_or_foot_looks_infected
|
| 207 |
+
allergic_reaction
|
| 208 |
+
congestion_in_chest
|
| 209 |
+
muscle_swelling
|
| 210 |
+
sleepiness
|
| 211 |
+
apnea
|
| 212 |
+
abnormal_breathing_sounds
|
| 213 |
+
blood_clots_during_menstrual_periods
|
| 214 |
+
absence_of_menstruation
|
| 215 |
+
pulling_at_ears
|
| 216 |
+
gum_pain
|
| 217 |
+
redness_in_ear
|
| 218 |
+
fluid_retention
|
| 219 |
+
flu-like_syndrome
|
| 220 |
+
sinus_congestion
|
| 221 |
+
painful_sinuses
|
| 222 |
+
fears_and_phobias
|
| 223 |
+
recent_pregnancy
|
| 224 |
+
uterine_contractions
|
| 225 |
+
burning_chest_pain
|
| 226 |
+
back_cramps_or_spasms
|
| 227 |
+
stiffness_all_over
|
| 228 |
+
muscle_cramps,_contractures,_or_spasms
|
| 229 |
+
back_mass_or_lump
|
| 230 |
+
nosebleed
|
| 231 |
+
long_menstrual_periods
|
| 232 |
+
heavy_menstrual_flow
|
| 233 |
+
unpredictable_menstruation
|
| 234 |
+
painful_menstruation
|
| 235 |
+
infertility
|
| 236 |
+
frequent_menstruation
|
| 237 |
+
sweating
|
| 238 |
+
mass_on_eyelid
|
| 239 |
+
swollen_eye
|
| 240 |
+
eyelid_swelling
|
| 241 |
+
eyelid_lesion_or_rash
|
| 242 |
+
symptoms_of_bladder
|
| 243 |
+
irregular_appearing_nails
|
| 244 |
+
itching_of_skin
|
| 245 |
+
hurts_to_breath
|
| 246 |
+
skin_dryness,_peeling,_scaliness,_or_roughness
|
| 247 |
+
skin_on_arm_or_hand_looks_infected
|
| 248 |
+
skin_irritation
|
| 249 |
+
itchy_scalp
|
| 250 |
+
warts
|
| 251 |
+
bumps_on_penis
|
| 252 |
+
too_little_hair
|
| 253 |
+
skin_rash
|
| 254 |
+
mass_or_swelling_around_the_anus
|
| 255 |
+
ankle_swelling
|
| 256 |
+
dry_or_flaky_scalp
|
| 257 |
+
foot_or_toe_stiffness_or_tightness
|
| 258 |
+
elbow_swelling
|
| 259 |
+
early_or_late_onset_of_menopause
|
| 260 |
+
bleeding_from_ear
|
| 261 |
+
hand_or_finger_weakness
|
| 262 |
+
low_self-esteem
|
| 263 |
+
itching_of_the_anus
|
| 264 |
+
swollen_or_red_tonsils
|
| 265 |
+
irregular_belly_button
|
| 266 |
+
hip_stiffness_or_tightness
|
| 267 |
+
mouth_pain
|
| 268 |
+
arm_weakness
|
| 269 |
+
penis_pain
|
| 270 |
+
loss_of_sex_drive
|
| 271 |
+
obsessions_and_compulsions
|
| 272 |
+
antisocial_behavior
|
| 273 |
+
neck_cramps_or_spasms
|
| 274 |
+
sneezing
|
| 275 |
+
leg_weakness
|
| 276 |
+
penis_redness
|
| 277 |
+
penile_discharge
|
| 278 |
+
shoulder_lump_or_mass
|
| 279 |
+
cloudy_eye
|
| 280 |
+
hysterical_behavior
|
| 281 |
+
arm_lump_or_mass
|
| 282 |
+
nightmares
|
| 283 |
+
bleeding_gums
|
| 284 |
+
pain_in_gums
|
| 285 |
+
bedwetting
|
| 286 |
+
diaper_rash
|
| 287 |
+
lump_or_mass_of_breast
|
| 288 |
+
postpartum_problems_of_the_breast
|
| 289 |
+
hesitancy
|
| 290 |
+
throat_redness
|
| 291 |
+
joint_swelling
|
| 292 |
+
redness_in_or_around_nose
|
| 293 |
+
wrinkles_on_skin
|
| 294 |
+
back_stiffness_or_tightness
|
| 295 |
+
wrist_lump_or_mass
|
| 296 |
+
low_urine_output
|
| 297 |
+
sore_in_nose
|
symptom_model.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae1e0d191d55db28ae79157690febcd9121cf911e6bacc5dedba1cd30dbdc572
|
| 3 |
+
size 615592529
|
symptom_model.labels.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da84a97a05c084170813701d487044da5f40a019a81fb4896094042489657ac0
|
| 3 |
+
size 910
|