File size: 14,750 Bytes
d4aab9e
6380821
d4aab9e
 
 
 
 
 
 
 
f49f5f4
e6e175b
 
9ef2934
e705a27
 
d4aab9e
e705a27
 
 
 
 
 
d4aab9e
e705a27
 
 
 
 
 
 
 
 
d4aab9e
e6e175b
e705a27
 
 
e6e175b
e705a27
 
 
 
 
 
 
 
 
e6e175b
e705a27
 
 
 
e6e175b
 
e705a27
 
e6e175b
e705a27
 
 
 
e6e175b
e705a27
e6e175b
 
9ef2934
e705a27
 
 
e6e175b
e705a27
e6e175b
 
d4aab9e
e6e175b
e705a27
 
 
 
 
e6e175b
 
 
 
 
e705a27
e6e175b
 
e705a27
 
 
 
 
e6e175b
 
 
 
e705a27
e6e175b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636d9ec
e6e175b
636d9ec
 
e6e175b
6d45d63
636d9ec
e6e175b
 
 
 
 
6c5a951
e6e175b
 
 
 
 
 
 
 
6c5a951
91d0519
e6e175b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91d0519
e6e175b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e705a27
 
f49f5f4
 
 
 
9ef2934
 
f49f5f4
6c5a951
d6999f2
e6e175b
f49f5f4
 
 
d6999f2
e6e175b
d6999f2
9ef2934
 
 
 
f49f5f4
e6e175b
636d9ec
d6999f2
e705a27
d6999f2
 
636d9ec
d6999f2
e705a27
 
6c5a951
e705a27
6c5a951
e6e175b
d6999f2
9ef2934
e705a27
6c5a951
 
e705a27
6c5a951
9ef2934
636d9ec
e6e175b
636d9ec
d6999f2
9ef2934
 
f49f5f4
6c5a951
636d9ec
6c5a951
9ef2934
 
e705a27
9ef2934
e6e175b
e705a27
e6e175b
e705a27
 
e6e175b
9ef2934
 
 
 
 
e6e175b
 
 
 
 
 
636d9ec
e705a27
9ef2934
 
 
e6e175b
 
 
 
9ef2934
6c5a951
e6e175b
e705a27
 
 
 
e6e175b
 
e705a27
e6e175b
9ef2934
 
6c5a951
636d9ec
9ef2934
 
 
e6e175b
 
 
 
 
9ef2934
e6e175b
 
 
 
 
 
 
 
 
 
 
 
 
 
9ef2934
6c5a951
f49f5f4
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
import streamlit as st
from streamlit_autorefresh import st_autorefresh
import pymongo
import requests
import chromadb
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import pandas as pd
import plotly.express as px
# plotly.graph_objects (go) is not strictly needed for this px.timeline approach but good to keep for flexibility
# import plotly.graph_objects as go

# Load environment variables first so every os.getenv() below sees .env values
load_dotenv()

# Page config (must be the first Streamlit call in the script)
st.set_page_config(
    page_title="System Health Dashboard",
    page_icon="πŸ”",
    layout="wide"
)

# Initialize logs directory and files
try:
    os.makedirs('logs', exist_ok=True)
    # Seed the health-history file with an empty JSON object so later
    # readers (save_system_health / main) never see a missing file.
    if not os.path.exists('logs/system_health.json'):
        with open('logs/system_health.json', 'w') as f:
            json.dump({}, f)
except Exception as e_init:
    # Without the log file the history charts cannot work; abort the app.
    st.error(f"CRITICAL ERROR during log directory/file initialization: {e_init}")
    st.stop()

# --- STATUS CHECK FUNCTIONS (Keep as they are - confirmed working) ---
def check_mongo_status():
    """Check MongoDB connectivity and gather basic job-collection stats.

    Returns:
        tuple: (is_up: bool, total_jobs: int, missing_html: int).
        On any failure (missing MONGO_URI, timeout, auth error) returns
        (False, 0, 0) instead of raising, so the dashboard keeps rendering.
    """
    client = None
    try:
        mongo_uri = os.getenv('MONGO_URI')
        if not mongo_uri:
            return False, 0, 0
        # Short server-selection timeout so a down DB doesn't stall the page.
        client = pymongo.MongoClient(mongo_uri, serverSelectionTimeoutMS=2000)
        client.admin.command('ping')
        db = client[os.getenv('MONGO_DB_NAME', "job_scraper")]
        jobs_collection = db[os.getenv('MONGO_JOBS_COLLECTION', "jobs")]
        total_jobs = jobs_collection.count_documents({})
        missing_html = jobs_collection.count_documents({"html_content": {"$exists": False}})
        return True, total_jobs, missing_html
    except Exception:
        return False, 0, 0
    finally:
        # Fix: close the client explicitly — the original leaked one
        # connection pool per 10-second autorefresh cycle.
        if client is not None:
            client.close()

def check_chroma_status():
    """Return True when the ChromaDB server answers a heartbeat, else False."""
    try:
        chroma_host = os.getenv('CHROMA_HOST')
        if not chroma_host:
            return False
        chromadb.HttpClient(host=chroma_host, ssl=False).heartbeat()
        return True
    except Exception:
        return False

def check_api_status():
    """Probe the embedding API health endpoint; returns False on any error."""
    try:
        api_health_url = os.getenv('EMBEDDING_API_URL_HEALTH')
        if not api_health_url:
            return False
        resp = requests.get(api_health_url, verify=False, timeout=5)
        # 405 means the route exists but rejects GET — server is still up.
        if resp.ok:
            return True
        return resp.status_code == 405
    except Exception:
        return False

def check_llm_status():
    """Probe the LLM API health endpoint; returns False on any error."""
    try:
        llm_health_url = os.getenv('LLM_API_URL_HEALTH')
        if not llm_health_url:
            return False
        resp = requests.get(llm_health_url, verify=False, timeout=5)
        # 405 means the route exists but rejects GET — server is still up.
        if resp.ok:
            return True
        return resp.status_code == 405
    except Exception:
        return False

# --- SAVE SYSTEM HEALTH FUNCTION (Keep as is - confirmed working) ---
def save_system_health(mongo_status, chroma_status, api_status, llm_status):
    filepath = 'logs/system_health.json'
    try:
        current_time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        health_data = {}
        if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
            try:
                with open(filepath, 'r') as f: health_data = json.load(f)
            except json.JSONDecodeError: health_data = {}
        if not isinstance(health_data, dict): health_data = {}
        health_data[current_time_str] = {
            'mongo': mongo_status, 'chroma': chroma_status,
            'api': api_status, 'llm': llm_status
        }
        cutoff_time_dt = datetime.now() - timedelta(hours=24)
        health_data_pruned = {}
        for k_str, v_dict in health_data.items():
            try:
                parsed_dt = datetime.strptime(k_str, '%Y-%m-%d %H:%M:%S')
                if parsed_dt >= cutoff_time_dt: health_data_pruned[k_str] = v_dict
            except ValueError:
                try: # Fallback for older format
                    parsed_dt = datetime.strptime(k_str, '%Y-%m-%d %H:%M')
                    if parsed_dt >= cutoff_time_dt: health_data_pruned[k_str] = v_dict
                except ValueError: continue
        with open(filepath, 'w') as f: json.dump(health_data_pruned, f, indent=2)
    except Exception as e: st.sidebar.error(f"Error in save_system_health: {e}")


# --- NEW TIMELINE PLOT STATUS FUNCTION ---
def plot_status_timeline(df_service, service_name_for_plot, chart_title, container):
    """
    Render a Gantt-style status timeline for one service into `container`.

    Each recorded status becomes a colored segment lasting until the next
    record; the final segment extends slightly past "now" so the current
    state reaches the right edge of the chart.

    Args:
        df_service: DataFrame with 'Time' (datetime) and 'ReadableStatus'
            ('LIVE' / 'DISCONNECTED' / 'UNKNOWN') columns for one service.
        service_name_for_plot: Y-axis category label for the single bar row.
        chart_title: title shown above the chart.
        container: Streamlit container (e.g. a column) to draw into.
    """
    if df_service.empty:
        container.info(f"No data available for {chart_title}.")
        return

    # Segment boundaries depend on chronological order.
    df_service = df_service.sort_values('Time').reset_index(drop=True)

    # End of the observation window, slightly beyond "now" so the last
    # segment is visible. Assumes naive datetimes throughout (data comes
    # from datetime.now() as well).
    window_end_time = datetime.now() + timedelta(minutes=2)

    # Build one segment per record: start at the record's timestamp, end at
    # the next record's timestamp (or the window end for the last record).
    # This single loop also covers the one-record case, which the previous
    # implementation handled with a redundant special-case branch.
    timeline_data = []
    for i in range(len(df_service)):
        current_row = df_service.iloc[i]
        start_time = current_row['Time']
        if i < len(df_service) - 1:
            end_time = df_service.iloc[i + 1]['Time']
        else:
            end_time = window_end_time

        # Skip degenerate (zero/negative length) segments.
        if start_time < end_time:
            timeline_data.append(dict(
                Task=service_name_for_plot,   # Y-axis category
                Start=start_time,
                Finish=end_time,
                Status=current_row['ReadableStatus']  # drives the color map
            ))

    if not timeline_data:
        container.info(f"Not enough data to create timeline segments for {chart_title}.")
        return

    df_timeline = pd.DataFrame(timeline_data)

    try:
        fig = px.timeline(
            df_timeline,
            x_start="Start",
            x_end="Finish",
            y="Task",  # This will be the service_name_for_plot
            color="Status",
            color_discrete_map={"LIVE": "green", "DISCONNECTED": "red", "UNKNOWN": "grey"},
            title=chart_title
        )

        fig.update_layout(
            showlegend=False,
            xaxis_title="Time",
            yaxis_title="",  # service name is clear from the bar label / title
            xaxis_range=[  # fixed 3-hour window so all service charts line up
                datetime.now() - timedelta(hours=3, minutes=5),
                datetime.now() + timedelta(minutes=5)
            ],
            margin=dict(l=20, r=20, t=50, b=20)  # extra top margin for the title
        )
        # Pin the single category so the bar fills the vertical space.
        fig.update_yaxes(categoryorder='array', categoryarray=[service_name_for_plot])

        container.plotly_chart(fig, use_container_width=True)
    except Exception as e_plot:
        container.error(f"Error plotting timeline for {chart_title}: {e_plot}")


# --- MAIN APPLICATION LOGIC ---
def main():
    """Render the dashboard page: live status metrics, DB coverage,
    Mongo statistics, and per-service 3-hour health-history timelines.

    Re-runs every 10 seconds via st_autorefresh; each run re-checks all
    services and appends the result to logs/system_health.json.
    """
    st_autorefresh(interval=10_000, key="health_watch")

    st.title("System Health Dashboard")
    current_timestamp_display = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    st.caption(f"Last checked: {current_timestamp_display}")

    # Service Status Checks
    col1_status, col2_status, col3_status, col4_status = st.columns(4)
    mongo_status, total_jobs, missing_html = check_mongo_status()
    chroma_status = check_chroma_status()
    api_status = check_api_status()
    llm_status = check_llm_status()
    
    # Persist this run's results so the history charts below have data.
    save_system_health(mongo_status, chroma_status, api_status, llm_status) # This is working

    with col1_status: st.metric("MongoDB Status", "LIVE βœ…" if mongo_status else "DISCONNECTED ❌")
    with col2_status: st.metric("ChromaDB Status", "LIVE βœ…" if chroma_status else "DISCONNECTED ❌")
    with col3_status: st.metric("Embedding API Status", "LIVE βœ…" if api_status else "DISCONNECTED ❌")
    with col4_status: st.metric("LLM API Status", "LIVE βœ…" if llm_status else "DISCONNECTED ❌")

    # Database Coverage (Keep as is)
    # Compares the Chroma embedded-document count against Mongo's job count.
    st.subheader("Database Coverage")
    c1_db, c2_db = st.columns(2)
    chroma_count_val = 0
    coverage = 0.0
    if chroma_status:
        try:
            chroma_host = os.getenv('CHROMA_HOST')
            chroma_client_obj = chromadb.HttpClient(host=chroma_host, ssl=False)
            collection_name_env = os.getenv('CHROMA_COLLECTION')
            if collection_name_env:
                collection_obj = chroma_client_obj.get_collection(name=collection_name_env)
                chroma_count_val = collection_obj.count()
            else: st.sidebar.warning("CHROMA_COLLECTION env var not set for count.")
        except Exception as e_chroma_count:
            st.error(f"Error getting ChromaDB count: {e_chroma_count}")
            chroma_count_val = "Error"
    # chroma_count_val / coverage may be strings here ("Error" / "N/A") — the
    # metrics below branch on type so either renders sensibly.
    if total_jobs > 0 and isinstance(chroma_count_val, int): coverage = (chroma_count_val / total_jobs * 100)
    elif isinstance(chroma_count_val, int) and chroma_count_val > 0 and total_jobs == 0: coverage = "N/A (No jobs)"
    elif isinstance(chroma_count_val, str): coverage = "N/A"
    with c1_db: st.metric("Embedded Jobs (Chroma)", f"{chroma_count_val:,}" if isinstance(chroma_count_val, int) else chroma_count_val)
    with c2_db: st.metric("Embedding Coverage", f"{coverage:.1f}%" if isinstance(coverage, float) else coverage)

    # MongoDB Statistics (Keep as is)
    st.subheader("MongoDB Statistics")
    sc1_mongo, sc2_mongo = st.columns(2)
    with sc1_mongo: st.metric("Total Jobs", f"{total_jobs:,}")
    with sc2_mongo: st.metric("Jobs Missing HTML", f"{missing_html:,}")

    # System Health History
    st.subheader("System Health History (Last 3 Hours)")
    
    # Reload the history file (including the entry just saved above).
    health_data_main = {}
    filepath_main = 'logs/system_health.json'
    if os.path.exists(filepath_main) and os.path.getsize(filepath_main) > 2:
        try:
            with open(filepath_main, 'r') as f: health_data_main = json.load(f)
        except json.JSONDecodeError:
            st.sidebar.error(f"Error: Corrupted {filepath_main}. History might be incomplete.")
            health_data_main = {}
        except Exception as e_load_main:
            st.sidebar.error(f"Error loading {filepath_main}: {e_load_main}")
            health_data_main = {}
    
    # Flatten {timestamp: {service: bool}} into per-(time, service) rows.
    df_list_main = []
    if isinstance(health_data_main, dict) and health_data_main:
        three_hours_ago_main = datetime.now() - timedelta(hours=3)
        # Ensure keys are sorted by time before processing for df_list_main
        # This is crucial for the segment logic in plot_status_timeline
        sorted_health_keys = sorted(health_data_main.keys())

        for k_str in sorted_health_keys:
            v_dict = health_data_main[k_str]
            try:
                parsed_timestamp_val = datetime.strptime(k_str, '%Y-%m-%d %H:%M:%S')
                if parsed_timestamp_val >= three_hours_ago_main and isinstance(v_dict, dict):
                    for svc, status_bool in v_dict.items():
                        df_list_main.append({
                            'Time': parsed_timestamp_val, 'Service': svc,
                            'ReadableStatus': 'LIVE' if status_bool else 'DISCONNECTED'
                            # StatusNumeric is not directly used by px.timeline, but keep if other parts need it
                            # 'StatusNumeric': 1 if status_bool else 0
                        })
            except (ValueError, TypeError): 
                # Fallback for legacy seconds-less timestamp keys.
                try:
                    parsed_timestamp_val = datetime.strptime(k_str, '%Y-%m-%d %H:%M')
                    if parsed_timestamp_val >= three_hours_ago_main and isinstance(v_dict, dict):
                        for svc, status_bool in v_dict.items():
                            df_list_main.append({
                                'Time': parsed_timestamp_val, 'Service': svc,
                                'ReadableStatus': 'LIVE' if status_bool else 'DISCONNECTED'
                            })
                except (ValueError, TypeError): continue
    
    if not df_list_main:
        st.info("No system health history data available for the last 3 hours to plot.")
    else:
        df_health_main = pd.DataFrame(df_list_main)
        if not df_health_main.empty:
            df_health_main['Time'] = pd.to_datetime(df_health_main['Time'])
            
            # Ensure each service's data is sorted by time before passing to plot function
            # This is critical for the logic inside plot_status_timeline that determines segments
            df_health_main = df_health_main.sort_values(by=['Service', 'Time'])

            hc1_hist, hc2_hist, hc3_hist, hc4_hist = st.columns(4)
            
            # Maps the JSON service key to (Y-axis label, chart title).
            services_map = {
                'mongo': ('MongoDB', 'MongoDB Health History'),
                'chroma': ('ChromaDB', 'ChromaDB Health History'),
                'api': ('Embedding API', 'Embedding API Health History'),
                'llm': ('LLM API', 'LLM API Health History')
            }
            containers = [hc1_hist, hc2_hist, hc3_hist, hc4_hist]
            
            for i, (service_key, (plot_y_label, chart_title_text)) in enumerate(services_map.items()):
                service_df = df_health_main[df_health_main['Service'] == service_key].copy()
                # plot_status_timeline expects df_service to be sorted by Time
                service_df = service_df.sort_values('Time') 
                plot_status_timeline(service_df, plot_y_label, chart_title_text, containers[i])
        else:
            st.info("Health history data processed into an empty DataFrame; nothing to plot.")

# Script entry point: run the dashboard when executed directly via streamlit.
if __name__ == "__main__":
    main()