# Resp_text / Model_Train.py
# Solomon17705's picture
# Added Files
# 3ef5978
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib
# ----------------------------
# 1. Fixed symptom vocabulary
#    Each entry becomes one 0/1 feature, in exactly this order.
# ----------------------------
SYMPTOM_KEYWORDS = [
    "cough", "shortness of breath", "wheezing", "chest pain",
    "fever", "sore throat", "fatigue", "nasal congestion",
]
# ----------------------------
# 2. Load the training data (the CSV is the only source of labels)
# ----------------------------
CSV_FILE = "filtered_dataset.csv"  # path of the input CSV
TEXT_COL = "Symptoms"  # column holding the symptom descriptions
LABEL_COL = "Disease"  # column holding the disease names

# Read the file, discard rows missing either the text or the label, and copy
# the result so the column assignment below works on an independent frame.
df = pd.read_csv(CSV_FILE).dropna(subset=[TEXT_COL, LABEL_COL]).copy()
# Force every description to str so the text can be lower-cased later.
df[TEXT_COL] = df[TEXT_COL].astype(str)

print(f"Loaded {len(df)} rows from CSV.")
disease_names = sorted(df[LABEL_COL].unique())
print(f"Unique diseases found: {disease_names}")
# ----------------------------
# 3. Convert free text -> binary symptom vector
# ----------------------------
def symptoms_to_binary_vector(text: str, keywords=None) -> list:
    """Encode a symptom description as one 0/1 flag per keyword.

    Matching is a case-insensitive substring test, so a keyword counts as
    present wherever it occurs in ``text`` -- including inside a longer word
    or in a negated phrase such as "no fever".

    Args:
        text: Free-text symptom description.
        keywords: Keywords to look for, in output order. When omitted, the
            module-level ``SYMPTOM_KEYWORDS`` list is used.

    Returns:
        A list of ints with one element per keyword; element ``i`` is 1 when
        keyword ``i`` occurs in ``text`` and 0 otherwise.
    """
    if keywords is None:
        keywords = SYMPTOM_KEYWORDS
    lowered = text.lower()
    # Lower-case the keyword too, so caller-supplied keywords match
    # regardless of how they are capitalised.
    return [int(keyword.lower() in lowered) for keyword in keywords]
# Featurise every row; the labels are taken straight from the CSV column.
X = [symptoms_to_binary_vector(description) for description in df[TEXT_COL]]
y = df[LABEL_COL].values
# ----------------------------
# 4. Map disease labels to integer class ids
# ----------------------------
label_encoder = LabelEncoder()
# Learn the label -> id mapping first, then apply it to the same labels.
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)
# ----------------------------
# 5. Train the model and check it on the held-out split
# ----------------------------
# Hold out 20% of the rows, stratified on the encoded labels.
# NOTE: a stratified split needs at least two rows per disease;
# train_test_split raises ValueError otherwise.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows so every run reports how well the fitted model
# does on data it was not trained on.
print(f"Held-out accuracy: {model.score(X_test, y_test):.3f} ({len(X_test)} rows)")
# ----------------------------
# 6. Persist the artifacts for the app
# ----------------------------
joblib.dump(model, "disease_model.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")
# The keyword list is saved alongside the model -- presumably so the app can
# rebuild feature vectors in the same order used for training (confirm
# against the consuming code).
joblib.dump(SYMPTOM_KEYWORDS, "symptom_keywords.pkl")

# "\u2705" is U+2705 (check-mark emoji); the escape keeps this source line
# ASCII so the character cannot be corrupted by a wrong file encoding.
print("\n\u2705 Training complete!")
print("Saved: disease_model.pkl, label_encoder.pkl, symptom_keywords.pkl")