import streamlit as st import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split # 1. Load Real SQL Query Logs def load_data(): # Make sure this filename matches the actual file in your "data/" directory return pd.read_csv("data/sql_query_logs_2025_04_25.csv") # 2. Preprocess Data and Train the Model def preprocess_and_train_model(df): # Ensure avg_exec_time_ms is numeric df['avg_exec_time_ms'] = pd.to_numeric(df['avg_exec_time_ms'], errors='coerce') # Ensure feature columns are numeric for col in ['query_length', 'num_joins', 'has_subquery', 'uses_index']: df[col] = pd.to_numeric(df[col], errors='coerce') # Drop rows with any NaNs in required columns df = df.dropna(subset=['avg_exec_time_ms', 'query_length', 'num_joins', 'has_subquery', 'uses_index']) # Define 'slow' query (threshold = 1000 ms) df['is_slow'] = df['avg_exec_time_ms'] > 1000 # Prepare feature matrix and target variable features = ['query_length', 'num_joins', 'has_subquery', 'uses_index'] X = df[features] y = df['is_slow'].astype(int) # Train the model model = RandomForestClassifier() model.fit(X, y) return model # 3. Provide Recommendations def recommend_tips(query): tips = [] if query['query_length'] > 800: tips.append("🔍 Query is long — consider breaking it into smaller chunks.") if query['num_joins'] > 3: tips.append("🪢 Too many JOINs — simplify joins or add proper indexing.") if query['has_subquery']: tips.append("🧠 Subquery detected — flatten subqueries if possible.") if not query['uses_index']: tips.append("⚡ Index not used — create indexes on filter/join columns.") if not tips: tips.append("✅ Query structure looks optimized.") return tips # 4. Streamlit App Interface def main(): #st.title("🚀 SQL Query Performance Predictor") st.markdown( "

🚀SQL Query Performance Predictor

", unsafe_allow_html=True) # Load data df = load_data() st.subheader("📊 Query Logs Preview") st.write(df.head()) # Train model model = preprocess_and_train_model(df) # Input for user SQL query st.subheader("📝 Enter Your SQL Query") query_text = st.text_area("SQL Query", height=150) if query_text: # Extract features from user query query_length = len(query_text) num_joins = query_text.lower().count('join') has_subquery = 1 if 'select' in query_text.lower() and 'from' in query_text.lower() and query_text.lower().count('select') > 1 else 0 uses_index = 1 if 'index' in query_text.lower() else 0 query_features = pd.DataFrame({ 'query_length': [query_length], 'num_joins': [num_joins], 'has_subquery': [has_subquery], 'uses_index': [uses_index] }) # Predict prediction = model.predict(query_features)[0] if prediction == 1: st.error("🛑 This query is likely to be **Slow**.") else: st.success("✅ This query is likely to be **Fast**.") # Recommendations st.subheader("🛠️ Optimization Tips") recommendations = recommend_tips(query_features.iloc[0]) for tip in recommendations: st.write(tip) # Run the app if __name__ == '__main__': main()