Spaces:

Tantawi
/

Text_classification

Sleeping

File size: 5,347 Bytes

f2a4578

import argparse
import json
import os
from typing import List, Dict, Any

import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder


def load_artifacts(prefix: str):
    """Load the trained model artifacts."""
    model_path = f"{prefix}.json"
    labels_path = f"{prefix}.labels.npy"
    features_path = f"{prefix}.features.txt"

    if not (os.path.exists(model_path) and os.path.exists(labels_path) and os.path.exists(features_path)):
        raise FileNotFoundError(f"Missing artifacts. Expected: {model_path}, {labels_path}, {features_path}")

    model = xgb.XGBClassifier()
    model.load_model(model_path)

    label_encoder = LabelEncoder()
    classes = np.load(labels_path, allow_pickle=True)
    label_encoder.classes_ = classes

    with open(features_path, "r", encoding="utf-8") as f:
        feature_names = [line.strip() for line in f if line.strip()]

    return model, label_encoder, feature_names


def build_feature_vector(symptom_names: List[str], selected_symptoms: List[str]) -> np.ndarray:
    """Convert symptom list to feature vector."""
    features = np.zeros(len(symptom_names), dtype=float)
    name_to_index = {name.lower().strip(): idx for idx, name in enumerate(symptom_names)}
    
    for symptom in selected_symptoms:
        key = symptom.lower().strip()
        if key in name_to_index:
            features[name_to_index[key]] = 1.0
    
    return features.reshape(1, -1)


def predict_symptoms_json(symptoms: List[str], model, label_encoder, feature_names: List[str]) -> Dict[str, Any]:
    """Return predictions in JSON format for API integration."""
    if not symptoms:
        return {"error": "No symptoms provided"}
    
    # Build feature vector
    x = build_feature_vector(feature_names, symptoms)
    
    # Get predictions
    proba = model.predict_proba(x)[0]
    top3_idx = np.argsort(proba)[-3:][::-1]
    
    # Format results
    predictions = []
    for rank, idx in enumerate(top3_idx, 1):
        disease_name = label_encoder.inverse_transform([idx])[0]
        confidence = float(proba[idx])
        predictions.append({
            "rank": rank,
            "disease": disease_name,
            "confidence": confidence,
            "confidence_percent": round(confidence * 100, 2)
        })
    
    return {
        "input_symptoms": symptoms,
        "primary_diagnosis": predictions[0],
        "top_predictions": predictions,
        "model_confidence": "high" if predictions[0]["confidence"] > 0.7 else "medium" if predictions[0]["confidence"] > 0.4 else "low"
    }


def predict_symptoms_csv(symptoms: List[str], model, label_encoder, feature_names: List[str]) -> str:
    """Return predictions in CSV format."""
    if not symptoms:
        return "error,No symptoms provided"
    
    x = build_feature_vector(feature_names, symptoms)
    proba = model.predict_proba(x)[0]
    top3_idx = np.argsort(proba)[-3:][::-1]
    
    csv_lines = ["rank,disease,confidence,confidence_percent"]
    for rank, idx in enumerate(top3_idx, 1):
        disease_name = label_encoder.inverse_transform([idx])[0]
        confidence = proba[idx]
        csv_lines.append(f"{rank},{disease_name},{confidence:.4f},{confidence*100:.2f}")
    
    return "\n".join(csv_lines)


def predict_symptoms_simple(symptoms: List[str], model, label_encoder, feature_names: List[str]) -> str:
    """Return simple text format."""
    if not symptoms:
        return "Error: No symptoms provided"
    
    x = build_feature_vector(feature_names, symptoms)
    proba = model.predict_proba(x)[0]
    top1_idx = np.argmax(proba)
    
    disease_name = label_encoder.inverse_transform([top1_idx])[0]
    confidence = proba[top1_idx]
    
    return f"Diagnosis: {disease_name} (Confidence: {confidence*100:.1f}%)"


def main():
    parser = argparse.ArgumentParser(description="API-style symptom checker using saved model")
    parser.add_argument("--symptoms", nargs="+", required=True, help="List of symptoms")
    parser.add_argument("--format", choices=["json", "csv", "simple"], default="json", help="Output format")
    parser.add_argument("--artifacts-prefix", default="symptom_checker/symptom_model", help="Path to model artifacts")
    args = parser.parse_args()

    try:
        # Load the trained model
        model, label_encoder, feature_names = load_artifacts(args.artifacts_prefix)
        
        # Get predictions in requested format
        if args.format == "json":
            result = predict_symptoms_json(args.symptoms, model, label_encoder, feature_names)
            print(json.dumps(result, indent=2))
        elif args.format == "csv":
            result = predict_symptoms_csv(args.symptoms, model, label_encoder, feature_names)
            print(result)
        elif args.format == "simple":
            result = predict_symptoms_simple(args.symptoms, model, label_encoder, feature_names)
            print(result)
            
    except Exception as e:
        error_result = {"error": str(e), "input_symptoms": args.symptoms}
        if args.format == "json":
            print(json.dumps(error_result, indent=2))
        else:
            print(f"Error: {e}")


if __name__ == "__main__":
    main()