# Snapshot metadata carried over from the hosting page:
#   Spaces status: Runtime error
#   File size: 2,286 Bytes
#   Revision id: 3ef5978
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib
# ----------------------------
# 1. Define the symptom keywords you care about (fixed set)
# ----------------------------
# Fixed vocabulary of symptom phrases. Order matters: position i of every
# feature vector built from this list corresponds to SYMPTOM_KEYWORDS[i].
SYMPTOM_KEYWORDS = [
    "cough",
    "shortness of breath",
    "wheezing",
    "chest pain",
    "fever",
    "sore throat",
    "fatigue",
    "nasal congestion",
]
# ----------------------------
# 2. Load your CSV (labels come ONLY from here)
# ----------------------------
CSV_FILE = "filtered_dataset.csv"  # ← your actual file
TEXT_COL = "Symptoms"  # ← column with symptom descriptions
LABEL_COL = "Disease"  # ← column with disease names

# Read the dataset and keep only rows that have both a symptom text and a
# label; .copy() detaches the result so the column assignment below is safe.
required_columns = [TEXT_COL, LABEL_COL]
df = pd.read_csv(CSV_FILE).dropna(subset=required_columns).copy()

# Force the symptom column to plain strings before any text processing.
df[TEXT_COL] = df[TEXT_COL].astype(str)

row_count = len(df)
print(f"Loaded {row_count} rows from CSV.")
disease_names = sorted(df[LABEL_COL].unique())
print(f"Unique diseases found: {disease_names}")
# ----------------------------
# 3. Convert free-text → binary symptom vector
# ----------------------------
def symptoms_to_binary_vector(text: str, keywords=None) -> list:
    """Encode a free-text symptom description as a 0/1 keyword-presence vector.

    Matching is a case-insensitive substring test, so "Coughing" counts as a
    hit for the keyword "cough".

    Args:
        text: Free-text symptom description.
        keywords: Optional iterable of keyword phrases to look for. Defaults
            to the module-level SYMPTOM_KEYWORDS, which keeps the original
            one-argument call working unchanged.

    Returns:
        A list of ints, one per keyword in iteration order: 1 if the keyword
        occurs in ``text``, otherwise 0.
    """
    if keywords is None:
        keywords = SYMPTOM_KEYWORDS
    lowered = text.lower()
    # Lower-case the keyword as well so caller-supplied vocabularies do not
    # have to be pre-normalised; the default vocabulary is already lowercase.
    return [1 if keyword.lower() in lowered else 0 for keyword in keywords]
# Apply to every row
# One binary keyword vector per CSV row, in row order.
symptom_texts = df[TEXT_COL]
X = list(map(symptoms_to_binary_vector, symptom_texts))
# Targets are taken straight from the label column of the CSV.
y = df[LABEL_COL].values
# ----------------------------
# 4. Encode labels (if not already numeric)
# ----------------------------
# Learn the label -> integer mapping first, then apply it to the same labels.
# The fitted encoder is kept so predictions can be mapped back to names.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
# ----------------------------
# 5. Train model
# ----------------------------
# Hold out 20% of the rows for evaluation. A stratified split keeps the
# disease proportions equal in both parts, but scikit-learn raises ValueError
# when a class has fewer than two samples or the test split is too small to
# hold one sample of every class. In that case fall back to a plain random
# split so a few rare diseases in the CSV do not abort training.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )
except ValueError as split_error:
    print(f"Warning: stratified split not possible ({split_error}); "
          "falling back to a random split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

# Fixed random_state keeps the trained forest reproducible between runs.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Use the held-out rows that were split off above: report mean accuracy so a
# badly performing model is visible before it is saved.
test_accuracy = model.score(X_test, y_test)
print(f"Held-out accuracy: {test_accuracy:.3f} ({len(y_test)} test rows)")
# ----------------------------
# 6. Save everything for your app
# ----------------------------
# Everything the app needs at inference time: the classifier, the encoder
# that maps predicted integers back to disease names, and the keyword
# vocabulary that defines the feature-vector layout.
artifacts = {
    "disease_model.pkl": model,
    "label_encoder.pkl": label_encoder,
    "symptom_keywords.pkl": SYMPTOM_KEYWORDS,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("\n✅ Training complete!")
print("Saved: disease_model.pkl, label_encoder.pkl, symptom_keywords.pkl")