import streamlit as st import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split # 1. Load Real SQL Query Logs def load_data(): # Make sure this filename matches the actual file in your "data/" directory return pd.read_csv("data/sql_query_logs_2025_04_25.csv") # 2. Preprocess Data and Train the Model def preprocess_and_train_model(df): # Ensure avg_exec_time_ms is numeric df['avg_exec_time_ms'] = pd.to_numeric(df['avg_exec_time_ms'], errors='coerce') # Ensure feature columns are numeric for col in ['query_length', 'num_joins', 'has_subquery', 'uses_index']: df[col] = pd.to_numeric(df[col], errors='coerce') # Drop rows with any NaNs in required columns df = df.dropna(subset=['avg_exec_time_ms', 'query_length', 'num_joins', 'has_subquery', 'uses_index']) # Define 'slow' query (threshold = 1000 ms) df['is_slow'] = df['avg_exec_time_ms'] > 1000 # Prepare feature matrix and target variable features = ['query_length', 'num_joins', 'has_subquery', 'uses_index'] X = df[features] y = df['is_slow'].astype(int) # Train the model model = RandomForestClassifier() model.fit(X, y) return model # 3. Provide Recommendations def recommend_tips(query): tips = [] if query['query_length'] > 800: tips.append("🔍 Query is long — consider breaking it into smaller chunks.") if query['num_joins'] > 3: tips.append("🪢 Too many JOINs — simplify joins or add proper indexing.") if query['has_subquery']: tips.append("🧠 Subquery detected — flatten subqueries if possible.") if not query['uses_index']: tips.append("⚡ Index not used — create indexes on filter/join columns.") if not tips: tips.append("✅ Query structure looks optimized.") return tips # 4. Streamlit App Interface def main(): #st.title("🚀 SQL Query Performance Predictor") st.markdown( "