File size: 3,278 Bytes
08cf9a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8726b74
08cf9a9
 
 
 
 
8726b74
08cf9a9
 
 
 
 
 
 
 
 
 
 
8726b74
08cf9a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import gradio as gr
import joblib
import pandas as pd
import numpy as np
import os

# --- 1. Load the Model ---
MODEL_FILE = "server_failure_model.pkl"

def load_model():
    """Load the serialized model stored at ``MODEL_FILE``.

    ``joblib.load`` is tried first. If it raises, the failure is logged and
    the file is re-read with the standard ``pickle`` module instead.

    Returns:
        The deserialized object, or ``None`` when ``MODEL_FILE`` does not
        exist on disk.

    Raises:
        Any exception raised by the ``pickle`` fallback (opening the file or
        unpickling it) propagates to the caller.
    """
    import logging
    import pickle

    if not os.path.exists(MODEL_FILE):
        return None

    try:
        return joblib.load(MODEL_FILE)
    except Exception:
        # Deliberately broad: any joblib failure triggers the pickle fallback.
        # Record the reason (with traceback) so it is not lost silently.
        logging.getLogger(__name__).warning(
            "joblib could not load %s; falling back to pickle",
            MODEL_FILE,
            exc_info=True,
        )
        # SECURITY: unpickling (via joblib or pickle) can execute arbitrary
        # code. Only load model files that come from a trusted source.
        with open(MODEL_FILE, 'rb') as f:
            return pickle.load(f)

# Loaded once at import time; stays None when the model file is absent.
model = load_model()

# --- 2. The Prediction Function ---
def predict_failure(cpu_current, cpu_sustained, ram_current, temp_current, temp_change):
    """Classify a single telemetry reading with the module-level ``model``.

    Args:
        cpu_current: Value used for the ``cpu_percent`` feature.
        cpu_sustained: Value used for the ``cpu_rolling_avg`` feature.
        ram_current: Value used for both ``ram_percent`` and
            ``ram_rolling_avg``.
        temp_current: Value used for ``cpu_temp``; ``gpu_temp`` is derived
            from it.
        temp_change: Value used for the ``cpu_temp_change`` feature.

    Returns:
        A ``(status, probability)`` pair. ``status`` is a display string and
        ``probability`` is a percentage string with one decimal place. When
        no model is loaded, an error message and ``0.0`` are returned.
    """
    if model is None:
        return "Error: Model not found. Please upload .pkl file.", 0.0

    # Convert each input to float once, up front.
    cpu_now = float(cpu_current)
    ram_now = float(ram_current)
    temp_now = float(temp_current)
    cpu_avg = float(cpu_sustained)
    temp_delta = float(temp_change)

    # Nine features; insertion order of this dict becomes the column order.
    features = {
        'cpu_percent': cpu_now,
        'ram_percent': ram_now,
        'cpu_temp': temp_now,
        # Not supplied by the UI: estimated as CPU temperature minus 15.
        'gpu_temp': temp_now - 15.0,
        # Fixed placeholder values (the UI has no inputs for these).
        'net_recv_bytes': 1024.0,
        'disk_write_bytes': 0.0,
        'cpu_rolling_avg': cpu_avg,
        # Not supplied by the UI: the current RAM value is reused.
        'ram_rolling_avg': ram_now,
        'cpu_temp_change': temp_delta,
    }
    input_df = pd.DataFrame([features])

    # Single-row frame, so take the first (only) result of each call.
    pred_class = model.predict(input_df)[0]
    failure_prob = model.predict_proba(input_df)[0][1]

    if pred_class == 1:
        status = "CRITICAL FAILURE IMMINENT"
    else:
        status = "SYSTEM NORMAL"

    return status, "{:.1f}%".format(failure_prob * 100)

# --- 3. The Gradio UI Interface ---
# This replaces your HTML file. Gradio builds the UI for you.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header: title, subtitle and a one-line description.
    gr.Markdown("# Server Health Sentinel AI")
    gr.Markdown("### AIOps Failure Prediction System (PoC)")
    gr.Markdown("This AI predicts if a server will overheat (>80°C) in the next 5 minutes based on telemetry trends.")

    with gr.Row():
        # Left column: the five telemetry sliders and the trigger button.
        with gr.Column():
            gr.Markdown("### Live Telemetry Simulation")

            s_cpu = gr.Slider(minimum=0, maximum=100, value=10, label="Current CPU Load (%)")
            s_cpu_avg = gr.Slider(
                minimum=0,
                maximum=100,
                value=10,
                label="Sustained CPU Load (Last 1 min avg) (%)",
            )
            s_ram = gr.Slider(minimum=0, maximum=100, value=30, label="RAM Usage (%)")
            s_temp = gr.Slider(minimum=30, maximum=100, value=50, label="Current Temperature (°C)")
            s_change = gr.Slider(
                minimum=-2,
                maximum=5,
                value=0,
                step=0.5,
                label="Temp Change Rate (°C/sec)",
            )

            btn = gr.Button("Run Prediction", variant="primary")

        # Right column: prediction outputs and a short model description.
        with gr.Column():
            gr.Markdown("### AI Diagnosis")
            out_status = gr.Textbox(label="Status")
            out_prob = gr.Textbox(label="Failure Probability")

            gr.Markdown("""
            **Architecture:** Random forest Classifier
            **Trained on:** 10,000+ Real-world Linux Telemetry Points
            """)

    # Slider order matches predict_failure's parameter order; the two
    # returned values fill the status and probability textboxes.
    telemetry_inputs = [s_cpu, s_cpu_avg, s_ram, s_temp, s_change]
    diagnosis_outputs = [out_status, out_prob]
    btn.click(predict_failure, telemetry_inputs, diagnosis_outputs)

# Start the Gradio server for the interface built above.
demo.launch()