diabete-risk-predictor / backend /utils /preprocessing.py
theamazingruby's picture
Initial commit with Docker and HF Spaces setup
e08b48d
from __future__ import annotations
from typing import Any
from ..schemas.requests import ClinicalData
def build_feature_vector(data: ClinicalData) -> list[float]:
    """Flatten a clinical record into an ordered list of floats.

    Args:
        data: Record exposing the eight clinical attributes listed below.

    Returns:
        The attribute values cast to ``float``, always in this order:
        pregnancies, glucose, blood_pressure, skin_thickness, insulin,
        bmi, diabetes_pedigree_function, age.
    """
    # The position of each value in the output is fixed by this tuple.
    ordered_fields = (
        "pregnancies",
        "glucose",
        "blood_pressure",
        "skin_thickness",
        "insulin",
        "bmi",
        "diabetes_pedigree_function",
        "age",
    )
    return [float(getattr(data, field)) for field in ordered_fields]
def engineer_features(data: ClinicalData) -> dict[str, float | str]:
    """Derive ratio indices and categorical buckets from a clinical record.

    Args:
        data: Record exposing ``glucose``, ``insulin``, ``bmi``, ``age`` and
            ``diabetes_pedigree_function``.

    Returns:
        A dict with four numeric indices (rounded to 4 decimals) followed by
        the ``bmi_category`` and ``age_group`` labels.
    """
    glucose = data.glucose
    bmi = data.bmi
    age = data.age

    # insulin == 0 is a common missing-data placeholder in the Pima dataset.
    # Left uncapped, the ratio then equals the raw glucose value (e.g. 148),
    # which produces an extreme z-score and collapses the model output, so
    # the ratio is capped at 10.
    raw_ratio = glucose / (data.insulin + 1.0)
    capped_ratio = min(raw_ratio, 10.0)

    bmi_label = "obese" if bmi >= 30 else "overweight" if bmi >= 25 else "normal"
    age_label = "young" if age < 35 else "adult" if age < 55 else "senior"

    numeric_features = {
        "glucose_insulin_index": capped_ratio,
        "glucose_bmi_index": (glucose / 100.0) * (bmi / 30.0),
        "age_risk_index": age / 50.0,
        "family_risk_index": data.diabetes_pedigree_function * 1.5,
    }

    features: dict[str, float | str] = {
        name: round(value, 4) for name, value in numeric_features.items()
    }
    features["bmi_category"] = bmi_label
    features["age_group"] = age_label
    return features
def risk_level_from_probability(probability: float) -> str:
    """Map a probability onto a coarse risk label.

    Args:
        probability: Predicted probability to classify.

    Returns:
        ``"high"`` for values >= 0.7, ``"moderate"`` for values >= 0.4,
        otherwise ``"low"``.
    """
    # Ordered from the highest cut-off down; the first one reached wins.
    cutoffs = ((0.7, "high"), (0.4, "moderate"))
    for lower_bound, label in cutoffs:
        if probability >= lower_bound:
            return label
    return "low"
def normalize_scores(scores: dict[str, float]) -> dict[str, float]:
    """Rescale scores so the non-negative parts sum to one.

    Negative scores are treated as zero. When nothing positive remains, every
    label receives an equal share instead.

    Args:
        scores: Mapping of label to raw score.

    Returns:
        A new mapping with the same labels and normalized values; an empty
        input yields an empty mapping.
    """
    clipped = {label: max(score, 0.0) for label, score in scores.items()}
    total = sum(clipped.values())

    if total <= 0:
        # No positive mass to distribute: fall back to a uniform split.
        # max(..., 1) keeps the division safe for an empty mapping.
        equal_share = 1.0 / max(len(scores), 1)
        return dict.fromkeys(scores, equal_share)

    return {label: value / total for label, value in clipped.items()}
def symptoms_match_diabetes(text: str, top_label: str) -> bool:
    """Report whether free-text symptoms mention a diabetes-related keyword.

    The check only applies when ``top_label`` starts with ``"diabetes"``
    (case-insensitive); for any other label the result is ``False``.

    Keywords are matched case-insensitively at the *start* of a word, so
    inflected forms such as "thirsty" or "tiredness" still count, while
    unrelated words that merely contain a keyword ("retired" -> "tired",
    "television" -> "vision") no longer trigger a false positive.

    Args:
        text: Free-text symptom description.
        top_label: Highest-ranked label to compare the symptoms against.

    Returns:
        ``True`` if the label is a diabetes label and at least one keyword
        is found in ``text``; otherwise ``False``.
    """
    import re  # local import keeps this block self-contained

    lowered = text.lower()
    if not top_label.lower().startswith("diabetes"):
        return False

    diabetes_keywords = (
        "thirst",
        "urinate",
        "urination",
        "polyuria",
        "fatigue",
        "tired",
        "blurred",
        "vision",
        "weight loss",
    )
    # \b anchors each keyword to a word start; no trailing boundary is used so
    # that suffixes ("thirsty", "tiredness") remain valid matches.
    pattern = r"\b(?:" + "|".join(re.escape(word) for word in diabetes_keywords) + ")"
    return re.search(pattern, lowered) is not None
def build_llm_context(clinical_data: ClinicalData, risk_probability: float, symptoms_text: str) -> dict[str, Any]:
    """Bundle clinical inputs, the risk estimate and symptom text into one dict.

    Args:
        clinical_data: Record exposing the eight clinical attributes copied
            into the ``"clinical_data"`` sub-dict.
        risk_probability: Predicted probability; stored rounded to 4 decimals
            and also translated into a risk level label.
        symptoms_text: Symptom description, passed through unchanged.

    Returns:
        A dict with the keys ``"clinical_data"``, ``"risk_probability"``,
        ``"risk_level"`` and ``"symptoms_text"``.
    """
    clinical_fields = (
        "pregnancies",
        "glucose",
        "blood_pressure",
        "skin_thickness",
        "insulin",
        "bmi",
        "diabetes_pedigree_function",
        "age",
    )
    clinical_snapshot = {name: getattr(clinical_data, name) for name in clinical_fields}

    context = {"clinical_data": clinical_snapshot}
    context["risk_probability"] = round(risk_probability, 4)
    # The level is derived from the unrounded probability.
    context["risk_level"] = risk_level_from_probability(risk_probability)
    context["symptoms_text"] = symptoms_text
    return context