File size: 3,183 Bytes
e08b48d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from __future__ import annotations

from typing import Any

from ..schemas.requests import ClinicalData


def build_feature_vector(data: ClinicalData) -> list[float]:
    """Flatten a clinical record into a list of eight floats.

    The values appear in this fixed order: pregnancies, glucose,
    blood_pressure, skin_thickness, insulin, bmi,
    diabetes_pedigree_function, age.

    Args:
        data: Record exposing the eight attributes listed above.

    Returns:
        A new list holding each attribute converted with ``float``.
    """
    # NOTE(review): presumably this order matches what the downstream model
    # expects -- confirm with the model code before reordering.
    ordered_fields = (
        "pregnancies",
        "glucose",
        "blood_pressure",
        "skin_thickness",
        "insulin",
        "bmi",
        "diabetes_pedigree_function",
        "age",
    )
    return [float(getattr(data, field)) for field in ordered_fields]


def engineer_features(data: ClinicalData) -> dict[str, float | str]:
    """Derive numeric indices and categorical buckets from a clinical record.

    Args:
        data: Record exposing ``glucose``, ``insulin``, ``bmi``, ``age`` and
            ``diabetes_pedigree_function``.

    Returns:
        A dict with four indices rounded to 4 decimal places
        (``glucose_insulin_index``, ``glucose_bmi_index``,
        ``age_risk_index``, ``family_risk_index``) plus two labels:
        ``bmi_category`` ("normal", "overweight" or "obese") and
        ``age_group`` ("young", "adult" or "senior").
    """
    glucose = data.glucose
    bmi = data.bmi
    age = data.age

    # insulin == 0 is a common missing-data placeholder in the Pima dataset.
    # The +1.0 keeps the division defined for that case, and the cap at 10.0
    # stops the resulting huge ratio (e.g. 148) from becoming an extreme
    # z-score that collapses the model output.
    raw_ratio = glucose / (data.insulin + 1.0)
    capped_ratio = min(raw_ratio, 10.0)

    # Anything below 25 (including low BMI values) is labelled "normal".
    bmi_label = "obese" if bmi >= 30 else "overweight" if bmi >= 25 else "normal"
    age_label = "young" if age < 35 else "adult" if age < 55 else "senior"

    indices = {
        "glucose_insulin_index": capped_ratio,
        "glucose_bmi_index": (glucose / 100.0) * (bmi / 30.0),
        "age_risk_index": age / 50.0,
        "family_risk_index": data.diabetes_pedigree_function * 1.5,
    }

    features: dict[str, float | str] = {
        key: round(value, 4) for key, value in indices.items()
    }
    features["bmi_category"] = bmi_label
    features["age_group"] = age_label
    return features


def risk_level_from_probability(probability: float) -> str:
    """Translate a probability into a coarse risk label.

    Args:
        probability: Score to classify.

    Returns:
        "high" when ``probability >= 0.7``, "moderate" when
        ``probability >= 0.4``, otherwise "low".
    """
    # Ordered from the strictest cutoff down; the first one met wins.
    cutoffs = ((0.7, "high"), (0.4, "moderate"))
    for floor, label in cutoffs:
        if probability >= floor:
            return label
    return "low"


def normalize_scores(scores: dict[str, float]) -> dict[str, float]:
    """Rescale scores so that the non-negative parts sum to 1.

    Negative scores are treated as 0.0 before normalising. When the clipped
    total is not positive (all scores <= 0, or ``scores`` is empty), every
    label receives an equal share instead.

    Args:
        scores: Mapping of label to raw score.

    Returns:
        A new dict with the same labels in the same order.
    """
    clipped = {label: max(value, 0.0) for label, value in scores.items()}
    total = sum(clipped.values())

    if total <= 0:
        # Nothing positive to distribute: fall back to a uniform split.
        # max(..., 1) avoids dividing by zero when scores is empty.
        share = 1.0 / max(len(scores), 1)
        return dict.fromkeys(scores, share)

    return {label: value / total for label, value in clipped.items()}


def symptoms_match_diabetes(text: str, top_label: str) -> bool:
    """Check whether free-text symptoms support a diabetes top label.

    Matching is a case-insensitive substring search, so a keyword also
    matches inside a longer word.

    Args:
        text: Symptom description to scan.
        top_label: Label to check; only labels that start with "diabetes"
            (case-insensitive) can produce a match.

    Returns:
        True when ``top_label`` starts with "diabetes" and ``text`` contains
        at least one of the keywords; False otherwise.
    """
    haystack = text.lower()
    if not top_label.lower().startswith("diabetes"):
        return False

    keywords = (
        "thirst",
        "urinate",
        "urination",
        "polyuria",
        "fatigue",
        "tired",
        "blurred",
        "vision",
        "weight loss",
    )
    for keyword in keywords:
        if keyword in haystack:
            return True
    return False


def build_llm_context(clinical_data: ClinicalData, risk_probability: float, symptoms_text: str) -> dict[str, Any]:
    """Bundle clinical values, risk summary and symptoms into one dict.

    Args:
        clinical_data: Record exposing the eight clinical attributes copied
            into the ``clinical_data`` sub-dict.
        risk_probability: Risk score; stored rounded to 4 decimal places and
            also converted to a label via ``risk_level_from_probability``.
        symptoms_text: Symptom description, passed through unchanged.

    Returns:
        A dict with keys ``clinical_data``, ``risk_probability``,
        ``risk_level`` and ``symptoms_text``.
    """
    clinical_fields = (
        "pregnancies",
        "glucose",
        "blood_pressure",
        "skin_thickness",
        "insulin",
        "bmi",
        "diabetes_pedigree_function",
        "age",
    )
    # Raw attribute values are copied as-is (no float conversion).
    clinical_snapshot = {
        field: getattr(clinical_data, field) for field in clinical_fields
    }

    context: dict[str, Any] = {"clinical_data": clinical_snapshot}
    context["risk_probability"] = round(risk_probability, 4)
    context["risk_level"] = risk_level_from_probability(risk_probability)
    context["symptoms_text"] = symptoms_text
    return context