Spaces:
Sleeping
Sleeping
Initial Commit
Browse files
Uploading model and app files
- App.py +96 -0
- requirements.txt +6 -0
- server_failure_model.pkl +3 -0
App.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import joblib
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
# --- 1. Load the Model ---
|
| 8 |
+
# Path of the serialized model file; it is a relative path, so it is resolved
# against the working directory the app is started from.
MODEL_FILE = "server_failure_model.pkl"
|
| 9 |
+
|
| 10 |
+
def load_model(path=None):
    """Load the serialized failure-prediction model from disk.

    Args:
        path: Location of the model file. When omitted, ``MODEL_FILE`` is
            used, so existing ``load_model()`` callers keep working.

    Returns:
        The deserialized object, or ``None`` when no file exists at the
        chosen path.

    Raises:
        Exception: Whatever ``pickle.load`` raises when the fallback loader
            cannot read the file either.
    """
    import logging
    import pickle

    model_path = MODEL_FILE if path is None else path
    if not os.path.exists(model_path):
        return None

    try:
        return joblib.load(model_path)
    except Exception:
        # The pickle fallback is a deliberate best-effort, but record why
        # joblib failed instead of discarding the error silently.
        logging.getLogger(__name__).warning(
            "joblib could not load %s; retrying with pickle",
            model_path,
            exc_info=True,
        )
        # SECURITY: unpickling can execute arbitrary code; only load model
        # files that come from a trusted source.
        with open(model_path, "rb") as f:
            return pickle.load(f)
|
| 19 |
+
|
| 20 |
+
# Loaded once at import time; stays None when MODEL_FILE does not exist.
model = load_model()
|
| 21 |
+
|
| 22 |
+
# --- 2. The Prediction Function ---
|
| 23 |
+
def predict_failure(cpu_current, cpu_sustained, ram_current, temp_current, temp_change):
    """Classify one telemetry snapshot with the module-level ``model``.

    Args:
        cpu_current: Current CPU load; passed to the model as ``cpu_percent``.
        cpu_sustained: Sustained CPU load; passed as ``cpu_rolling_avg``.
        ram_current: RAM usage; passed as both ``ram_percent`` and
            ``ram_rolling_avg``.
        temp_current: Current temperature; passed as ``cpu_temp`` and used
            to derive ``gpu_temp``.
        temp_change: Temperature change rate; passed as ``cpu_temp_change``.

    Returns:
        A ``(status, probability)`` pair. With a model loaded, ``status`` is
        one of two fixed messages and ``probability`` is a string such as
        ``"12.3%"``. Without a model, an error message and ``0.0``.
    """
    if model is None:
        return "Error: Model not found. Please upload .pkl file.", 0.0

    # Convert the UI values once, in the same order the feature row uses them.
    cpu_load = float(cpu_current)
    ram_load = float(ram_current)
    cpu_temp = float(temp_current)
    cpu_avg = float(cpu_sustained)
    temp_delta = float(temp_change)

    # The nine model features. Key order is kept unchanged because it sets
    # the DataFrame column order handed to the model.
    features = {
        'cpu_percent': cpu_load,
        'ram_percent': ram_load,
        'cpu_temp': cpu_temp,
        'gpu_temp': cpu_temp - 15.0,      # heuristic: not measured, derived from cpu_temp
        'net_recv_bytes': 1024.0,         # fixed placeholder
        'disk_write_bytes': 0.0,          # fixed placeholder
        'cpu_rolling_avg': cpu_avg,
        'ram_rolling_avg': ram_load,      # no separate input; reuses current RAM
        'cpu_temp_change': temp_delta,
    }
    frame = pd.DataFrame([features])

    label = model.predict(frame)[0]
    # NOTE(review): column 1 of predict_proba is presumed to be the failure
    # class; confirm against the trained model's classes_.
    failure_prob = model.predict_proba(frame)[0][1]

    if label == 1:
        status = "CRITICAL FAILURE IMMINENT"
    else:
        status = "SYSTEM NORMAL"

    return status, f"{failure_prob * 100:.1f}%"
|
| 59 |
+
|
| 60 |
+
# --- 3. The Gradio UI Interface ---
|
| 61 |
+
# This replaces your HTML file. Gradio builds the UI for you.
|
| 62 |
+
# Two-column page: telemetry sliders and the trigger button on the left,
# the model's verdict on the right.
soft_theme = gr.themes.Soft()
with gr.Blocks(theme=soft_theme) as demo:
    gr.Markdown(value="# 🖥️ Server Health Sentinel AI")
    gr.Markdown(value="### AIOps Failure Prediction System (PoC)")
    gr.Markdown(
        value="This AI predicts if a server will overheat (>80°C) in the next 5 minutes based on telemetry trends."
    )

    with gr.Row():
        # Left column: simulated telemetry inputs.
        with gr.Column():
            gr.Markdown(value="### 🎛️ Live Telemetry Simulation")

            s_cpu = gr.Slider(
                minimum=0, maximum=100, value=10,
                label="Current CPU Load (%)",
            )
            s_cpu_avg = gr.Slider(
                minimum=0, maximum=100, value=10,
                label="Sustained CPU Load (Last 1 min avg) (%)",
            )
            s_ram = gr.Slider(
                minimum=0, maximum=100, value=30,
                label="RAM Usage (%)",
            )
            s_temp = gr.Slider(
                minimum=30, maximum=100, value=50,
                label="Current Temperature (°C)",
            )
            s_change = gr.Slider(
                minimum=-2, maximum=5, value=0, step=0.5,
                label="Temp Change Rate (°C/sec)",
            )

            btn = gr.Button(value="Run Prediction", variant="primary")

        # Right column: prediction results and a short model description.
        with gr.Column():
            gr.Markdown(value="### 🧠 AI Diagnosis")
            out_status = gr.Textbox(label="Status")
            out_prob = gr.Textbox(label="Failure Probability")

            gr.Markdown(value="""
            **Architecture:** Random forest Classifier
            **Trained on:** 10,000+ Real-world Linux Telemetry Points
            """)

    # Wire the button: slider values go in using predict_failure's parameter
    # order; its two return values fill the two textboxes.
    telemetry_inputs = [s_cpu, s_cpu_avg, s_ram, s_temp, s_change]
    diagnosis_outputs = [out_status, out_prob]
    btn.click(fn=predict_failure, inputs=telemetry_inputs, outputs=diagnosis_outputs)

# Start the app server.
demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
flask
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
scikit-learn
|
| 5 |
+
gunicorn
|
| 6 |
+
gradio
|
server_failure_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c6876a5cc41dfc0d4ed24cfdf26bf133f610a9ea1ef7166d4a0f9c27e77ffe4
|
| 3 |
+
size 1463449
|