import streamlit as st
from streamlit_autorefresh import st_autorefresh
import pymongo
import requests
import chromadb
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import pandas as pd
import plotly.express as px
# plotly.graph_objects (go) is not strictly needed for this px.timeline approach but good to keep for flexibility
# import plotly.graph_objects as go
# Load environment variables first so every later os.getenv() sees them.
load_dotenv()

# Streamlit page configuration — must run before any other st.* call.
st.set_page_config(
    page_title="System Health Dashboard",
    page_icon="π",
    layout="wide",
)
# Make sure the logs directory and the health-history JSON file exist
# before the dashboard starts reading/writing them; abort the app if not.
try:
    os.makedirs('logs', exist_ok=True)
    log_file = 'logs/system_health.json'
    if not os.path.exists(log_file):
        with open(log_file, 'w') as fh:
            json.dump({}, fh)
except Exception as e_init:
    st.error(f"CRITICAL ERROR during log directory/file initialization: {e_init}")
    st.stop()
# --- STATUS CHECK FUNCTIONS (Keep as they are - confirmed working) ---
def check_mongo_status():
    """Ping MongoDB and collect basic job-collection counts.

    Returns:
        tuple: ``(reachable, total_jobs, missing_html)`` where
        ``reachable`` is a bool and the counts are ints. Any driver or
        network failure yields ``(False, 0, 0)`` — callers treat that
        as "disconnected".
    """
    client = None
    try:
        mongo_uri = os.getenv('MONGO_URI')
        if not mongo_uri:
            return False, 0, 0
        client = pymongo.MongoClient(mongo_uri, serverSelectionTimeoutMS=2000)
        client.admin.command('ping')  # cheap reachability check
        db = client[os.getenv('MONGO_DB_NAME', "job_scraper")]
        jobs_collection = db[os.getenv('MONGO_JOBS_COLLECTION', "jobs")]
        total_jobs = jobs_collection.count_documents({})
        missing_html = jobs_collection.count_documents(
            {"html_content": {"$exists": False}})
        return True, total_jobs, missing_html
    except Exception:
        # Any failure (bad URI, timeout, auth, ...) is reported as "down".
        return False, 0, 0
    finally:
        # Close the connection pool; the original leaked one client per
        # Streamlit rerun (every 10 s via st_autorefresh).
        if client is not None:
            client.close()
def check_chroma_status():
    """Return True when the ChromaDB HTTP endpoint answers a heartbeat."""
    host = os.getenv('CHROMA_HOST')
    if not host:
        return False
    try:
        chromadb.HttpClient(host=host, ssl=False).heartbeat()
    except Exception:
        return False
    return True
def check_api_status():
    """Probe the embedding API health URL; HTTP 405 also counts as alive."""
    try:
        url = os.getenv('EMBEDDING_API_URL_HEALTH')
        if not url:
            return False
        resp = requests.get(url, verify=False, timeout=5)
        # 405 means the endpoint exists but rejects GET — still "up".
        return resp.ok or resp.status_code == 405
    except Exception:
        return False
def check_llm_status():
    """Probe the LLM API health URL; HTTP 405 also counts as alive."""
    try:
        url = os.getenv('LLM_API_URL_HEALTH')
        if not url:
            return False
        resp = requests.get(url, verify=False, timeout=5)
        # 405 means the endpoint exists but rejects GET — still "up".
        return resp.ok or resp.status_code == 405
    except Exception:
        return False
# --- SAVE SYSTEM HEALTH FUNCTION (Keep as is - confirmed working) ---
def _parse_health_timestamp(ts_str):
    """Parse a history key in either supported format, else return None.

    Accepts '%Y-%m-%d %H:%M:%S' (current) and '%Y-%m-%d %H:%M' (legacy).
    """
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M'):
        try:
            return datetime.strptime(ts_str, fmt)
        except ValueError:
            continue
    return None


def save_system_health(mongo_status, chroma_status, api_status, llm_status):
    """Append the current service statuses to logs/system_health.json.

    Entries older than 24 hours are pruned on every call. A corrupted or
    non-dict file is silently reset; keys with unparseable timestamps are
    dropped. Errors are only surfaced in the Streamlit sidebar so this
    best-effort persistence never crashes the dashboard.
    """
    filepath = 'logs/system_health.json'
    try:
        now = datetime.now()  # single instant for both the key and the cutoff
        health_data = {}
        if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
            try:
                with open(filepath, 'r') as f:
                    health_data = json.load(f)
            except json.JSONDecodeError:
                health_data = {}
        if not isinstance(health_data, dict):
            health_data = {}
        health_data[now.strftime('%Y-%m-%d %H:%M:%S')] = {
            'mongo': mongo_status, 'chroma': chroma_status,
            'api': api_status, 'llm': llm_status
        }
        cutoff = now - timedelta(hours=24)
        pruned = {}
        for ts_str, statuses in health_data.items():
            parsed = _parse_health_timestamp(ts_str)
            if parsed is not None and parsed >= cutoff:
                pruned[ts_str] = statuses
        with open(filepath, 'w') as f:
            json.dump(pruned, f, indent=2)
    except Exception as e:
        st.sidebar.error(f"Error in save_system_health: {e}")
# --- NEW TIMELINE PLOT STATUS FUNCTION ---
def plot_status_timeline(df_service, service_name_for_plot, chart_title, container):
    """
    Plot a timeline/Gantt-style status chart for one service.

    Args:
        df_service: DataFrame with 'Time' (datetime) and 'ReadableStatus'
            ('LIVE'/'DISCONNECTED') columns for a single service.
        service_name_for_plot: Y-axis category label for the bar.
        chart_title: Title shown above the chart.
        container: Streamlit container (e.g. a column) to render into.
    """
    if df_service.empty:
        container.info(f"No data available for {chart_title}.")
        return

    # Segments are built from consecutive samples, so chronological order
    # is required.
    df_service = df_service.sort_values('Time').reset_index(drop=True)

    # Naive datetimes throughout: datetime.now() and the log timestamps are
    # both naive. Capture "now" once so all derived bounds are consistent.
    now = datetime.now()
    window_end_time = now + timedelta(minutes=2)  # last bar extends past now

    # Each sample opens a segment lasting until the next sample, or until
    # the end of the observation window for the last one. A single sample
    # naturally yields one bar to the window end, so the original len==1
    # special case (which also skipped the start<end sanity check) is gone.
    timeline_data = []
    n = len(df_service)
    for i in range(n):
        row = df_service.iloc[i]
        start_time = row['Time']
        end_time = df_service.iloc[i + 1]['Time'] if i < n - 1 else window_end_time
        if start_time < end_time:  # skip degenerate/out-of-order segments
            timeline_data.append(dict(
                Task=service_name_for_plot,   # Y-axis category
                Start=start_time,
                Finish=end_time,
                Status=row['ReadableStatus'],  # drives the bar color
            ))

    if not timeline_data:
        container.info(f"Not enough data to create timeline segments for {chart_title}.")
        return

    df_timeline = pd.DataFrame(timeline_data)
    try:
        fig = px.timeline(
            df_timeline,
            x_start="Start",
            x_end="Finish",
            y="Task",
            color="Status",
            color_discrete_map={"LIVE": "green", "DISCONNECTED": "red", "UNKNOWN": "grey"},
            title=chart_title,
        )
        fig.update_layout(
            showlegend=False,
            xaxis_title="Time",
            yaxis_title="",  # the bar label / chart title already names the service
            # Fixed 3-hour window so all four service charts share one scale.
            xaxis_range=[now - timedelta(hours=3, minutes=5),
                         now + timedelta(minutes=5)],
            margin=dict(l=20, r=20, t=50, b=20),
        )
        # Single category per plot — pin it so the bar fills the chart.
        fig.update_yaxes(categoryorder='array', categoryarray=[service_name_for_plot])
        container.plotly_chart(fig, use_container_width=True)
    except Exception as e_plot:
        container.error(f"Error plotting timeline for {chart_title}: {e_plot}")
# --- MAIN APPLICATION LOGIC ---
def main():
    """Render the System Health Dashboard page (auto-refreshes every 10 s)."""
    st_autorefresh(interval=10_000, key="health_watch")
    st.title("System Health Dashboard")
    st.caption(f"Last checked: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # --- Live service checks ------------------------------------------------
    col1, col2, col3, col4 = st.columns(4)
    mongo_status, total_jobs, missing_html = check_mongo_status()
    chroma_status = check_chroma_status()
    api_status = check_api_status()
    llm_status = check_llm_status()
    save_system_health(mongo_status, chroma_status, api_status, llm_status)

    # NOTE(review): the status emoji were garbled in the extracted source
    # (string literals split across lines); ✅/❌ assumed — confirm against
    # the deployed app.
    with col1:
        st.metric("MongoDB Status", "LIVE ✅" if mongo_status else "DISCONNECTED ❌")
    with col2:
        st.metric("ChromaDB Status", "LIVE ✅" if chroma_status else "DISCONNECTED ❌")
    with col3:
        st.metric("Embedding API Status", "LIVE ✅" if api_status else "DISCONNECTED ❌")
    with col4:
        st.metric("LLM API Status", "LIVE ✅" if llm_status else "DISCONNECTED ❌")

    # --- Database coverage --------------------------------------------------
    st.subheader("Database Coverage")
    c1_db, c2_db = st.columns(2)
    chroma_count_val = 0
    coverage = 0.0
    if chroma_status:
        try:
            chroma_client_obj = chromadb.HttpClient(host=os.getenv('CHROMA_HOST'), ssl=False)
            collection_name_env = os.getenv('CHROMA_COLLECTION')
            if collection_name_env:
                chroma_count_val = chroma_client_obj.get_collection(name=collection_name_env).count()
            else:
                st.sidebar.warning("CHROMA_COLLECTION env var not set for count.")
        except Exception as e_chroma_count:
            st.error(f"Error getting ChromaDB count: {e_chroma_count}")
            chroma_count_val = "Error"  # string sentinel; handled below
    # `coverage` stays a float when computable, otherwise a display string.
    if total_jobs > 0 and isinstance(chroma_count_val, int):
        coverage = chroma_count_val / total_jobs * 100
    elif isinstance(chroma_count_val, int) and chroma_count_val > 0 and total_jobs == 0:
        coverage = "N/A (No jobs)"
    elif isinstance(chroma_count_val, str):
        coverage = "N/A"
    with c1_db:
        st.metric("Embedded Jobs (Chroma)",
                  f"{chroma_count_val:,}" if isinstance(chroma_count_val, int) else chroma_count_val)
    with c2_db:
        st.metric("Embedding Coverage",
                  f"{coverage:.1f}%" if isinstance(coverage, float) else coverage)

    # --- MongoDB statistics -------------------------------------------------
    st.subheader("MongoDB Statistics")
    sc1_mongo, sc2_mongo = st.columns(2)
    with sc1_mongo:
        st.metric("Total Jobs", f"{total_jobs:,}")
    with sc2_mongo:
        st.metric("Jobs Missing HTML", f"{missing_html:,}")

    # --- System health history ----------------------------------------------
    st.subheader("System Health History (Last 3 Hours)")

    def _parse_ts(ts_str):
        # Current and legacy timestamp formats used as history-file keys.
        for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M'):
            try:
                return datetime.strptime(ts_str, fmt)
            except (ValueError, TypeError):
                continue
        return None

    health_data_main = {}
    filepath_main = 'logs/system_health.json'
    if os.path.exists(filepath_main) and os.path.getsize(filepath_main) > 2:
        try:
            with open(filepath_main, 'r') as f:
                health_data_main = json.load(f)
        except json.JSONDecodeError:
            st.sidebar.error(f"Error: Corrupted {filepath_main}. History might be incomplete.")
            health_data_main = {}
        except Exception as e_load_main:
            st.sidebar.error(f"Error loading {filepath_main}: {e_load_main}")
            health_data_main = {}

    df_list_main = []
    if isinstance(health_data_main, dict) and health_data_main:
        three_hours_ago = datetime.now() - timedelta(hours=3)
        # Keys must be processed chronologically: plot_status_timeline builds
        # its segments from consecutive samples.
        for k_str in sorted(health_data_main.keys()):
            v_dict = health_data_main[k_str]
            ts = _parse_ts(k_str)
            if ts is None or ts < three_hours_ago or not isinstance(v_dict, dict):
                continue
            for svc, status_bool in v_dict.items():
                df_list_main.append({
                    'Time': ts, 'Service': svc,
                    'ReadableStatus': 'LIVE' if status_bool else 'DISCONNECTED'
                })

    if not df_list_main:
        st.info("No system health history data available for the last 3 hours to plot.")
        return

    # df_list_main is non-empty here, so the DataFrame is too (the original
    # had a dead "empty DataFrame" else-branch — removed).
    df_health_main = pd.DataFrame(df_list_main)
    df_health_main['Time'] = pd.to_datetime(df_health_main['Time'])
    df_health_main = df_health_main.sort_values(by=['Service', 'Time'])

    containers = st.columns(4)
    services_map = {
        'mongo': ('MongoDB', 'MongoDB Health History'),
        'chroma': ('ChromaDB', 'ChromaDB Health History'),
        'api': ('Embedding API', 'Embedding API Health History'),
        'llm': ('LLM API', 'LLM API Health History')
    }
    for container, (service_key, (plot_y_label, chart_title_text)) in zip(containers, services_map.items()):
        service_df = df_health_main[df_health_main['Service'] == service_key].copy()
        # plot_status_timeline expects the per-service frame sorted by Time.
        plot_status_timeline(service_df.sort_values('Time'),
                             plot_y_label, chart_title_text, container)
# Script entry point: only run the dashboard when executed directly.
if __name__ == "__main__":
    main()