# HuggingFace Spaces page-header residue (not code):
#   Spaces: Sleeping
import os

import gradio as gr
import joblib
import numpy as np
import pandas as pd
# --- 1. Load the Model ---
MODEL_FILE = "server_failure_model.pkl"

def load_model():
    """Return the trained classifier loaded from MODEL_FILE, or None if the
    file does not exist.

    Attempts joblib first (the format the model is expected to be saved in);
    if that load raises for any reason, falls back to a plain pickle load of
    the same file.
    """
    if not os.path.exists(MODEL_FILE):
        return None
    try:
        return joblib.load(MODEL_FILE)
    except Exception:
        # Best-effort fallback: the .pkl may have been written with the
        # plain pickle module rather than joblib.
        import pickle
        with open(MODEL_FILE, 'rb') as fh:
            return pickle.load(fh)

model = load_model()
# --- 2. The Prediction Function ---
def predict_failure(cpu_current, cpu_sustained, ram_current, temp_current, temp_change, clf=None):
    """Predict whether the server is about to overheat.

    Parameters
    ----------
    cpu_current : float
        Instantaneous CPU load (%).
    cpu_sustained : float
        Rolling-average CPU load over the last minute (%).
    ram_current : float
        RAM usage (%).
    temp_current : float
        Current CPU temperature (°C).
    temp_change : float
        Temperature change rate (°C/sec).
    clf : object, optional
        Classifier override exposing ``predict`` / ``predict_proba``.
        Defaults to the module-level ``model`` (allows injection for tests).

    Returns
    -------
    tuple[str, str]
        Human-readable status and the failure probability as ``"NN.N%"``.
    """
    if clf is None:
        clf = model
    if clf is None:
        # Fix: return a formatted-percentage string here too, so both return
        # paths feed the "Failure Probability" Textbox the same type.
        return "Error: Model not found. Please upload .pkl file.", "0.0%"

    # Single-row frame with the 9 features the model was trained on:
    # cpu_percent, ram_percent, cpu_temp, gpu_temp (inferred),
    # net_recv_bytes (noise), disk_write_bytes (noise),
    # cpu_rolling_avg, ram_rolling_avg (inferred), cpu_temp_change.
    input_df = pd.DataFrame([{
        'cpu_percent': float(cpu_current),
        'ram_percent': float(ram_current),
        'cpu_temp': float(temp_current),
        'gpu_temp': float(temp_current) - 15.0,  # Heuristic: GPU ~15°C cooler
        'net_recv_bytes': 1024.0,
        'disk_write_bytes': 0.0,
        'cpu_rolling_avg': float(cpu_sustained),
        'ram_rolling_avg': float(ram_current),
        'cpu_temp_change': float(temp_change)
    }])

    pred_class = clf.predict(input_df)[0]
    # Probability of the positive ("failure") class — assumed at column 1;
    # TODO confirm against clf.classes_ ordering for the shipped model.
    pred_prob = clf.predict_proba(input_df)[0][1]

    status = "CRITICAL FAILURE IMMINENT" if pred_class == 1 else "SYSTEM NORMAL"
    probability = f"{pred_prob * 100:.1f}%"
    return status, probability
# --- 3. The Gradio UI Interface ---
# Gradio builds the whole front-end in Python — no separate HTML file needed.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header / description.
    gr.Markdown("# Server Health Sentinel AI")
    gr.Markdown("### AIOps Failure Prediction System (PoC)")
    gr.Markdown("This AI predicts if a server will overheat (>80°C) in the next 5 minutes based on telemetry trends.")

    with gr.Row():
        # Left column: the five telemetry inputs.
        with gr.Column():
            gr.Markdown("### Live Telemetry Simulation")
            in_cpu = gr.Slider(0, 100, value=10, label="Current CPU Load (%)")
            in_cpu_avg = gr.Slider(0, 100, value=10, label="Sustained CPU Load (Last 1 min avg) (%)")
            in_ram = gr.Slider(0, 100, value=30, label="RAM Usage (%)")
            in_temp = gr.Slider(30, 100, value=50, label="Current Temperature (°C)")
            in_rate = gr.Slider(-2, 5, value=0, step=0.5, label="Temp Change Rate (°C/sec)")
            predict_btn = gr.Button("Run Prediction", variant="primary")

        # Right column: the model's diagnosis.
        with gr.Column():
            gr.Markdown("### AI Diagnosis")
            status_box = gr.Textbox(label="Status")
            prob_box = gr.Textbox(label="Failure Probability")
            gr.Markdown("""
            **Architecture:** Random forest Classifier
            **Trained on:** 10,000+ Real-world Linux Telemetry Points
            """)

    # Wire the button to the prediction function.
    predict_btn.click(
        fn=predict_failure,
        inputs=[in_cpu, in_cpu_avg, in_ram, in_temp, in_rate],
        outputs=[status_box, prob_box],
    )

# Launch the app.
demo.launch()