| | import streamlit as st |
| | import pandas as pd |
| | from sklearn.ensemble import RandomForestClassifier |
| | from sklearn.model_selection import train_test_split |
| |
|
| | |
| | def load_data(): |
| | |
| | return pd.read_csv("data/sql_query_logs_2025_04_25.csv") |
| |
|
| | |
| | def preprocess_and_train_model(df): |
| | |
| | df['avg_exec_time_ms'] = pd.to_numeric(df['avg_exec_time_ms'], errors='coerce') |
| |
|
| | |
| | for col in ['query_length', 'num_joins', 'has_subquery', 'uses_index']: |
| | df[col] = pd.to_numeric(df[col], errors='coerce') |
| |
|
| | |
| | df = df.dropna(subset=['avg_exec_time_ms', 'query_length', 'num_joins', 'has_subquery', 'uses_index']) |
| |
|
| | |
| | df['is_slow'] = df['avg_exec_time_ms'] > 1000 |
| |
|
| | |
| | features = ['query_length', 'num_joins', 'has_subquery', 'uses_index'] |
| | X = df[features] |
| | y = df['is_slow'].astype(int) |
| |
|
| | |
| | model = RandomForestClassifier() |
| | model.fit(X, y) |
| | return model |
| |
|
| | |
| | def recommend_tips(query): |
| | tips = [] |
| | if query['query_length'] > 800: |
| | tips.append("π Query is long β consider breaking it into smaller chunks.") |
| | if query['num_joins'] > 3: |
| | tips.append("πͺ’ Too many JOINs β simplify joins or add proper indexing.") |
| | if query['has_subquery']: |
| | tips.append("π§ Subquery detected β flatten subqueries if possible.") |
| | if not query['uses_index']: |
| | tips.append("β‘ Index not used β create indexes on filter/join columns.") |
| |
|
| | if not tips: |
| | tips.append("β
Query structure looks optimized.") |
| | return tips |
| |
|
| | |
| | def main(): |
| | |
| | st.markdown( |
| | "<h1 style='text-align: center; white-space: nowrap;'>πSQL Query Performance Predictor</h1>", |
| | unsafe_allow_html=True) |
| | |
| |
|
| |
|
| |
|
| | |
| | df = load_data() |
| |
|
| | st.subheader("π Query Logs Preview") |
| | st.write(df.head()) |
| |
|
| | |
| | model = preprocess_and_train_model(df) |
| |
|
| | |
| | st.subheader("π Enter Your SQL Query") |
| | query_text = st.text_area("SQL Query", height=150) |
| |
|
| | if query_text: |
| | |
| | query_length = len(query_text) |
| | num_joins = query_text.lower().count('join') |
| | has_subquery = 1 if 'select' in query_text.lower() and 'from' in query_text.lower() and query_text.lower().count('select') > 1 else 0 |
| | uses_index = 1 if 'index' in query_text.lower() else 0 |
| |
|
| | query_features = pd.DataFrame({ |
| | 'query_length': [query_length], |
| | 'num_joins': [num_joins], |
| | 'has_subquery': [has_subquery], |
| | 'uses_index': [uses_index] |
| | }) |
| |
|
| | |
| | prediction = model.predict(query_features)[0] |
| |
|
| | if prediction == 1: |
| | st.error("π This query is likely to be **Slow**.") |
| | else: |
| | st.success("β
This query is likely to be **Fast**.") |
| |
|
| | |
| | st.subheader("π οΈ Optimization Tips") |
| | recommendations = recommend_tips(query_features.iloc[0]) |
| | for tip in recommendations: |
| | st.write(tip) |
| |
|
| | |
| | if __name__ == '__main__': |
| | main() |
| |
|