File size: 2,286 Bytes
3ef5978
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

# ----------------------------
# 1. Fixed symptom vocabulary (one binary feature per entry)
# ----------------------------
# Entries are lowercase because the symptom text is lowercased before the
# substring match, and the position of each entry determines the position of
# its 0/1 flag in the feature vector.
SYMPTOM_KEYWORDS = [
    "cough", "shortness of breath",
    "wheezing", "chest pain",
    "fever", "sore throat",
    "fatigue", "nasal congestion",
]

# ----------------------------
# 2. Read the training data (the CSV is the only source of labels)
# ----------------------------
CSV_FILE = "filtered_dataset.csv"   # path of the dataset to train on
TEXT_COL = "Symptoms"               # column holding the free-text symptoms
LABEL_COL = "Disease"               # column holding the disease name

# Load, discard rows missing either the symptom text or the label, and keep
# an independent copy so the column assignment below acts on its own frame.
df = pd.read_csv(CSV_FILE).dropna(subset=[TEXT_COL, LABEL_COL]).copy()

# Force the symptom column to plain strings so text operations are safe.
df[TEXT_COL] = df[TEXT_COL].astype(str)

row_count = len(df)
print(f"Loaded {row_count} rows from CSV.")
disease_names = sorted(df[LABEL_COL].unique())
print(f"Unique diseases found: {disease_names}")

# ----------------------------
# 3. Convert free-text → binary symptom vector
# ----------------------------
def symptoms_to_binary_vector(text: str, keywords=None) -> list:
    """Encode a free-text symptom description as a 0/1 presence vector.

    Matching is a case-insensitive substring test, so "coughing" counts as a
    hit for "cough". No negation handling is attempted: "no fever" still
    sets the "fever" flag.

    Args:
        text: Free-text symptom description. ``None`` is treated as empty
            text, and any other non-string value is converted with ``str()``
            instead of raising ``AttributeError``.
        keywords: Lowercase keyword phrases to look for, in output order.
            When omitted, the module-level ``SYMPTOM_KEYWORDS`` is used.

    Returns:
        A list with one int per keyword: 1 if the keyword occurs in
        ``text``, otherwise 0.
    """
    if keywords is None:
        keywords = SYMPTOM_KEYWORDS
    # Normalise once so every keyword is compared against the same string.
    lowered = "" if text is None else str(text).lower()
    return [1 if keyword in lowered else 0 for keyword in keywords]

# Build the feature matrix: one binary symptom vector per CSV row
X = [symptoms_to_binary_vector(description) for description in df[TEXT_COL]]
y = df[LABEL_COL].values  # disease labels exactly as they appear in the CSV

# ----------------------------
# 4. Map disease labels to integer class ids
# ----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# ----------------------------
# 5. Train model
# ----------------------------
# Hold out 20% of the rows for evaluation. A stratified split keeps the class
# proportions similar in both parts, but scikit-learn raises ValueError when
# that is impossible (for example, a disease that appears in only one row).
# In that case fall back to a plain random split instead of aborting.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )
except ValueError as split_error:
    print(
        f"Warning: stratified split not possible ({split_error}); "
        "using a plain random split instead."
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Report accuracy on the held-out rows so the split is actually used and the
# quality of the saved model is visible at training time.
test_accuracy = model.score(X_test, y_test)
print(f"Held-out accuracy: {test_accuracy:.3f} ({len(X_test)} test rows)")

# ----------------------------
# 6. Persist the artifacts the app needs
# ----------------------------
# The model, the label encoder and the keyword list are written side by side,
# each to its own file, in this order.
_artifacts = (
    (model, "disease_model.pkl"),
    (label_encoder, "label_encoder.pkl"),
    (SYMPTOM_KEYWORDS, "symptom_keywords.pkl"),
)
for _artifact, _filename in _artifacts:
    joblib.dump(_artifact, _filename)

print("\n✅ Training complete!")
print("Saved: disease_model.pkl, label_encoder.pkl, symptom_keywords.pkl")