File size: 14,750 Bytes
d4aab9e
6380821
d4aab9e
 
 
 
 
 
 
 
f49f5f4
e6e175b
 
9ef2934
e705a27
 
d4aab9e
e705a27
 
 
 
 
 
d4aab9e
e705a27
 
 
 
 
 
 
 
 
d4aab9e
e6e175b
e705a27
 
 
e6e175b
e705a27
 
 
 
 
 
 
 
 
e6e175b
e705a27
 
 
 
e6e175b
 
e705a27
 
e6e175b
e705a27
 
 
 
e6e175b
e705a27
e6e175b
 
9ef2934
e705a27
 
 
e6e175b
e705a27
e6e175b
 
d4aab9e
e6e175b
e705a27
 
 
 
 
e6e175b
 
 
 
 
e705a27
e6e175b
 
e705a27
 
 
 
 
e6e175b
 
 
 
e705a27
e6e175b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636d9ec
e6e175b
636d9ec
 
e6e175b
6d45d63
636d9ec
e6e175b
 
 
 
 
6c5a951
e6e175b
 
 
 
 
 
 
 
6c5a951
91d0519
e6e175b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91d0519
e6e175b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e705a27
 
f49f5f4
 
 
 
9ef2934
 
f49f5f4
6c5a951
d6999f2
e6e175b
f49f5f4
 
 
d6999f2
e6e175b
d6999f2
9ef2934
 
 
 
f49f5f4
e6e175b
636d9ec
d6999f2
e705a27
d6999f2
 
636d9ec
d6999f2
e705a27
 
6c5a951
e705a27
6c5a951
e6e175b
d6999f2
9ef2934
e705a27
6c5a951
 
e705a27
6c5a951
9ef2934
636d9ec
e6e175b
636d9ec
d6999f2
9ef2934
 
f49f5f4
6c5a951
636d9ec
6c5a951
9ef2934
 
e705a27
9ef2934
e6e175b
e705a27
e6e175b
e705a27
 
e6e175b
9ef2934
 
 
 
 
e6e175b
 
 
 
 
 
636d9ec
e705a27
9ef2934
 
 
e6e175b
 
 
 
9ef2934
6c5a951
e6e175b
e705a27
 
 
 
e6e175b
 
e705a27
e6e175b
9ef2934
 
6c5a951
636d9ec
9ef2934
 
 
e6e175b
 
 
 
 
9ef2934
e6e175b
 
 
 
 
 
 
 
 
 
 
 
 
 
9ef2934
6c5a951
f49f5f4
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
import streamlit as st
from streamlit_autorefresh import st_autorefresh
import pymongo
import requests
import chromadb
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import pandas as pd
import plotly.express as px
# plotly.graph_objects (go) is not strictly needed for this px.timeline approach but good to keep for flexibility
# import plotly.graph_objects as go

# Load environment variables first so every os.getenv() below sees .env values
load_dotenv()

# Page config (must be the first Streamlit call in the script)
st.set_page_config(
    page_title="System Health Dashboard",
    page_icon="πŸ”",
    layout="wide"
)

# Initialize logs directory and files
try:
    os.makedirs('logs', exist_ok=True)
    # Seed the health-history file with an empty JSON object so later
    # readers (save_system_health / main) never see a missing file.
    if not os.path.exists('logs/system_health.json'):
        with open('logs/system_health.json', 'w') as f:
            json.dump({}, f)
except Exception as e_init:
    # Without the log file the history charts cannot work; abort the app.
    st.error(f"CRITICAL ERROR during log directory/file initialization: {e_init}")
    st.stop()

# --- STATUS CHECK FUNCTIONS (Keep as they are - confirmed working) ---
def check_mongo_status():
    """Check MongoDB connectivity and gather basic job-collection stats.

    Returns:
        tuple: (is_up: bool, total_jobs: int, missing_html: int).
        On any failure (missing MONGO_URI, timeout, auth error) returns
        (False, 0, 0) instead of raising, so the dashboard keeps rendering.
    """
    client = None
    try:
        mongo_uri = os.getenv('MONGO_URI')
        if not mongo_uri:
            return False, 0, 0
        # Short server-selection timeout so a down DB doesn't stall the page.
        client = pymongo.MongoClient(mongo_uri, serverSelectionTimeoutMS=2000)
        client.admin.command('ping')
        db = client[os.getenv('MONGO_DB_NAME', "job_scraper")]
        jobs_collection = db[os.getenv('MONGO_JOBS_COLLECTION', "jobs")]
        total_jobs = jobs_collection.count_documents({})
        missing_html = jobs_collection.count_documents({"html_content": {"$exists": False}})
        return True, total_jobs, missing_html
    except Exception:
        return False, 0, 0
    finally:
        # Fix: close the client explicitly — the original leaked one
        # connection pool per 10-second autorefresh cycle.
        if client is not None:
            client.close()

def check_chroma_status():
    """Return True when the ChromaDB server answers a heartbeat, else False."""
    try:
        chroma_host = os.getenv('CHROMA_HOST')
        if not chroma_host:
            return False
        chromadb.HttpClient(host=chroma_host, ssl=False).heartbeat()
        return True
    except Exception:
        return False

def check_api_status():
    """Probe the embedding API health endpoint; returns False on any error."""
    try:
        api_health_url = os.getenv('EMBEDDING_API_URL_HEALTH')
        if not api_health_url:
            return False
        resp = requests.get(api_health_url, verify=False, timeout=5)
        # 405 means the route exists but rejects GET — server is still up.
        if resp.ok:
            return True
        return resp.status_code == 405
    except Exception:
        return False

def check_llm_status():
    """Probe the LLM API health endpoint; returns False on any error."""
    try:
        llm_health_url = os.getenv('LLM_API_URL_HEALTH')
        if not llm_health_url:
            return False
        resp = requests.get(llm_health_url, verify=False, timeout=5)
        # 405 means the route exists but rejects GET — server is still up.
        if resp.ok:
            return True
        return resp.status_code == 405
    except Exception:
        return False

# --- SAVE SYSTEM HEALTH FUNCTION (Keep as is - confirmed working) ---
def save_system_health(mongo_status, chroma_status, api_status, llm_status):
    filepath = 'logs/system_health.json'
    try:
        current_time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        health_data = {}
        if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
            try:
                with open(filepath, 'r') as f: health_data = json.load(f)
            except json.JSONDecodeError: health_data = {}
        if not isinstance(health_data, dict): health_data = {}
        health_data[current_time_str] = {
            'mongo': mongo_status, 'chroma': chroma_status,
            'api': api_status, 'llm': llm_status
        }
        cutoff_time_dt = datetime.now() - timedelta(hours=24)
        health_data_pruned = {}
        for k_str, v_dict in health_data.items():
            try:
                parsed_dt = datetime.strptime(k_str, '%Y-%m-%d %H:%M:%S')
                if parsed_dt >= cutoff_time_dt: health_data_pruned[k_str] = v_dict
            except ValueError:
                try: # Fallback for older format
                    parsed_dt = datetime.strptime(k_str, '%Y-%m-%d %H:%M')
                    if parsed_dt >= cutoff_time_dt: health_data_pruned[k_str] = v_dict
                except ValueError: continue
        with open(filepath, 'w') as f: json.dump(health_data_pruned, f, indent=2)
    except Exception as e: st.sidebar.error(f"Error in save_system_health: {e}")


# --- NEW TIMELINE PLOT STATUS FUNCTION ---
def plot_status_timeline(df_service, service_name_for_plot, chart_title, container):
    """
    Render a Gantt-style status timeline for one service into `container`.

    Each recorded status becomes a colored segment lasting until the next
    record; the final segment extends slightly past "now" so the current
    state reaches the right edge of the chart.

    Args:
        df_service: DataFrame with 'Time' (datetime) and 'ReadableStatus'
            ('LIVE' / 'DISCONNECTED' / 'UNKNOWN') columns for one service.
        service_name_for_plot: Y-axis category label for the single bar row.
        chart_title: title shown above the chart.
        container: Streamlit container (e.g. a column) to draw into.
    """
    if df_service.empty:
        container.info(f"No data available for {chart_title}.")
        return

    # Segment boundaries depend on chronological order.
    df_service = df_service.sort_values('Time').reset_index(drop=True)

    # End of the observation window, slightly beyond "now" so the last
    # segment is visible. Assumes naive datetimes throughout (data comes
    # from datetime.now() as well).
    window_end_time = datetime.now() + timedelta(minutes=2)

    # Build one segment per record: start at the record's timestamp, end at
    # the next record's timestamp (or the window end for the last record).
    # This single loop also covers the one-record case, which the previous
    # implementation handled with a redundant special-case branch.
    timeline_data = []
    for i in range(len(df_service)):
        current_row = df_service.iloc[i]
        start_time = current_row['Time']
        if i < len(df_service) - 1:
            end_time = df_service.iloc[i + 1]['Time']
        else:
            end_time = window_end_time

        # Skip degenerate (zero/negative length) segments.
        if start_time < end_time:
            timeline_data.append(dict(
                Task=service_name_for_plot,   # Y-axis category
                Start=start_time,
                Finish=end_time,
                Status=current_row['ReadableStatus']  # drives the color map
            ))

    if not timeline_data:
        container.info(f"Not enough data to create timeline segments for {chart_title}.")
        return

    df_timeline = pd.DataFrame(timeline_data)

    try:
        fig = px.timeline(
            df_timeline,
            x_start="Start",
            x_end="Finish",
            y="Task",  # This will be the service_name_for_plot
            color="Status",
            color_discrete_map={"LIVE": "green", "DISCONNECTED": "red", "UNKNOWN": "grey"},
            title=chart_title
        )

        fig.update_layout(
            showlegend=False,
            xaxis_title="Time",
            yaxis_title="",  # service name is clear from the bar label / title
            xaxis_range=[  # fixed 3-hour window so all service charts line up
                datetime.now() - timedelta(hours=3, minutes=5),
                datetime.now() + timedelta(minutes=5)
            ],
            margin=dict(l=20, r=20, t=50, b=20)  # extra top margin for the title
        )
        # Pin the single category so the bar fills the vertical space.
        fig.update_yaxes(categoryorder='array', categoryarray=[service_name_for_plot])

        container.plotly_chart(fig, use_container_width=True)
    except Exception as e_plot:
        container.error(f"Error plotting timeline for {chart_title}: {e_plot}")


# --- MAIN APPLICATION LOGIC ---
def main():
    """Render the dashboard page: live status metrics, DB coverage,
    Mongo statistics, and per-service 3-hour health-history timelines.

    Re-runs every 10 seconds via st_autorefresh; each run re-checks all
    services and appends the result to logs/system_health.json.
    """
    st_autorefresh(interval=10_000, key="health_watch")

    st.title("System Health Dashboard")
    current_timestamp_display = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    st.caption(f"Last checked: {current_timestamp_display}")

    # Service Status Checks
    col1_status, col2_status, col3_status, col4_status = st.columns(4)
    mongo_status, total_jobs, missing_html = check_mongo_status()
    chroma_status = check_chroma_status()
    api_status = check_api_status()
    llm_status = check_llm_status()
    
    # Persist this run's results so the history charts below have data.
    save_system_health(mongo_status, chroma_status, api_status, llm_status) # This is working

    with col1_status: st.metric("MongoDB Status", "LIVE βœ…" if mongo_status else "DISCONNECTED ❌")
    with col2_status: st.metric("ChromaDB Status", "LIVE βœ…" if chroma_status else "DISCONNECTED ❌")
    with col3_status: st.metric("Embedding API Status", "LIVE βœ…" if api_status else "DISCONNECTED ❌")
    with col4_status: st.metric("LLM API Status", "LIVE βœ…" if llm_status else "DISCONNECTED ❌")

    # Database Coverage (Keep as is)
    # Compares the Chroma embedded-document count against Mongo's job count.
    st.subheader("Database Coverage")
    c1_db, c2_db = st.columns(2)
    chroma_count_val = 0
    coverage = 0.0
    if chroma_status:
        try:
            chroma_host = os.getenv('CHROMA_HOST')
            chroma_client_obj = chromadb.HttpClient(host=chroma_host, ssl=False)
            collection_name_env = os.getenv('CHROMA_COLLECTION')
            if collection_name_env:
                collection_obj = chroma_client_obj.get_collection(name=collection_name_env)
                chroma_count_val = collection_obj.count()
            else: st.sidebar.warning("CHROMA_COLLECTION env var not set for count.")
        except Exception as e_chroma_count:
            st.error(f"Error getting ChromaDB count: {e_chroma_count}")
            chroma_count_val = "Error"
    # chroma_count_val / coverage may be strings here ("Error" / "N/A") — the
    # metrics below branch on type so either renders sensibly.
    if total_jobs > 0 and isinstance(chroma_count_val, int): coverage = (chroma_count_val / total_jobs * 100)
    elif isinstance(chroma_count_val, int) and chroma_count_val > 0 and total_jobs == 0: coverage = "N/A (No jobs)"
    elif isinstance(chroma_count_val, str): coverage = "N/A"
    with c1_db: st.metric("Embedded Jobs (Chroma)", f"{chroma_count_val:,}" if isinstance(chroma_count_val, int) else chroma_count_val)
    with c2_db: st.metric("Embedding Coverage", f"{coverage:.1f}%" if isinstance(coverage, float) else coverage)

    # MongoDB Statistics (Keep as is)
    st.subheader("MongoDB Statistics")
    sc1_mongo, sc2_mongo = st.columns(2)
    with sc1_mongo: st.metric("Total Jobs", f"{total_jobs:,}")
    with sc2_mongo: st.metric("Jobs Missing HTML", f"{missing_html:,}")

    # System Health History
    st.subheader("System Health History (Last 3 Hours)")
    
    # Reload the history file (including the entry just saved above).
    health_data_main = {}
    filepath_main = 'logs/system_health.json'
    if os.path.exists(filepath_main) and os.path.getsize(filepath_main) > 2:
        try:
            with open(filepath_main, 'r') as f: health_data_main = json.load(f)
        except json.JSONDecodeError:
            st.sidebar.error(f"Error: Corrupted {filepath_main}. History might be incomplete.")
            health_data_main = {}
        except Exception as e_load_main:
            st.sidebar.error(f"Error loading {filepath_main}: {e_load_main}")
            health_data_main = {}
    
    # Flatten {timestamp: {service: bool}} into per-(time, service) rows.
    df_list_main = []
    if isinstance(health_data_main, dict) and health_data_main:
        three_hours_ago_main = datetime.now() - timedelta(hours=3)
        # Ensure keys are sorted by time before processing for df_list_main
        # This is crucial for the segment logic in plot_status_timeline
        sorted_health_keys = sorted(health_data_main.keys())

        for k_str in sorted_health_keys:
            v_dict = health_data_main[k_str]
            try:
                parsed_timestamp_val = datetime.strptime(k_str, '%Y-%m-%d %H:%M:%S')
                if parsed_timestamp_val >= three_hours_ago_main and isinstance(v_dict, dict):
                    for svc, status_bool in v_dict.items():
                        df_list_main.append({
                            'Time': parsed_timestamp_val, 'Service': svc,
                            'ReadableStatus': 'LIVE' if status_bool else 'DISCONNECTED'
                            # StatusNumeric is not directly used by px.timeline, but keep if other parts need it
                            # 'StatusNumeric': 1 if status_bool else 0
                        })
            except (ValueError, TypeError): 
                # Fallback for legacy seconds-less timestamp keys.
                try:
                    parsed_timestamp_val = datetime.strptime(k_str, '%Y-%m-%d %H:%M')
                    if parsed_timestamp_val >= three_hours_ago_main and isinstance(v_dict, dict):
                        for svc, status_bool in v_dict.items():
                            df_list_main.append({
                                'Time': parsed_timestamp_val, 'Service': svc,
                                'ReadableStatus': 'LIVE' if status_bool else 'DISCONNECTED'
                            })
                except (ValueError, TypeError): continue
    
    if not df_list_main:
        st.info("No system health history data available for the last 3 hours to plot.")
    else:
        df_health_main = pd.DataFrame(df_list_main)
        if not df_health_main.empty:
            df_health_main['Time'] = pd.to_datetime(df_health_main['Time'])
            
            # Ensure each service's data is sorted by time before passing to plot function
            # This is critical for the logic inside plot_status_timeline that determines segments
            df_health_main = df_health_main.sort_values(by=['Service', 'Time'])

            hc1_hist, hc2_hist, hc3_hist, hc4_hist = st.columns(4)
            
            # Maps the JSON service key to (Y-axis label, chart title).
            services_map = {
                'mongo': ('MongoDB', 'MongoDB Health History'),
                'chroma': ('ChromaDB', 'ChromaDB Health History'),
                'api': ('Embedding API', 'Embedding API Health History'),
                'llm': ('LLM API', 'LLM API Health History')
            }
            containers = [hc1_hist, hc2_hist, hc3_hist, hc4_hist]
            
            for i, (service_key, (plot_y_label, chart_title_text)) in enumerate(services_map.items()):
                service_df = df_health_main[df_health_main['Service'] == service_key].copy()
                # plot_status_timeline expects df_service to be sorted by Time
                service_df = service_df.sort_values('Time') 
                plot_status_timeline(service_df, plot_y_label, chart_title_text, containers[i])
        else:
            st.info("Health history data processed into an empty DataFrame; nothing to plot.")

# Script entry point: run the dashboard when executed directly via streamlit.
if __name__ == "__main__":
    main()