import streamlit as st
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# 1. Load Real SQL Query Logs
def load_data():
    """Read the historical SQL query logs into a DataFrame.

    The filename must match the actual export under the ``data/`` directory.
    """
    log_path = "data/sql_query_logs_2025_04_25.csv"
    return pd.read_csv(log_path)
# 2. Preprocess Data and Train the Model
def preprocess_and_train_model(df):
    """Clean the query-log DataFrame and fit a slow-query classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw query logs. Must contain the columns 'avg_exec_time_ms',
        'query_length', 'num_joins', 'has_subquery', 'uses_index'.

    Returns
    -------
    RandomForestClassifier
        Model fitted to predict whether a query is "slow"
        (average execution time above 1000 ms).
    """
    feature_cols = ['query_length', 'num_joins', 'has_subquery', 'uses_index']

    # Work on a copy so the caller's DataFrame (also shown in the UI)
    # is not mutated by the coercions and the added label column.
    df = df.copy()

    # Coerce target and features to numeric; bad values become NaN.
    df['avg_exec_time_ms'] = pd.to_numeric(df['avg_exec_time_ms'], errors='coerce')
    for col in feature_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop rows missing any required value.
    df = df.dropna(subset=['avg_exec_time_ms'] + feature_cols)

    # Label: a query is "slow" if it averages over 1000 ms.
    df['is_slow'] = df['avg_exec_time_ms'] > 1000

    X = df[feature_cols]
    y = df['is_slow'].astype(int)

    # Fixed seed so predictions are reproducible across Streamlit reruns.
    model = RandomForestClassifier(random_state=42)
    model.fit(X, y)
    return model
# 3. Provide Recommendations
def recommend_tips(query):
    """Return heuristic optimization tips for one query-feature row.

    Parameters
    ----------
    query : Mapping
        Dict-like row (e.g. ``DataFrame.iloc[0]``) with keys
        'query_length', 'num_joins', 'has_subquery', 'uses_index'.

    Returns
    -------
    list[str]
        One tip per triggered heuristic; a single "looks optimized"
        message when none trigger.
    """
    tips = []
    if query['query_length'] > 800:
        tips.append("📏 Query is long — consider breaking it into smaller chunks.")
    if query['num_joins'] > 3:
        tips.append("🪢 Too many JOINs — simplify joins or add proper indexing.")
    if query['has_subquery']:
        tips.append("🧠 Subquery detected — flatten subqueries if possible.")
    if not query['uses_index']:
        tips.append("⚡ Index not used — create indexes on filter/join columns.")
    if not tips:
        # Original string literal was broken across two lines (syntax error);
        # restored here as a single line.
        tips.append("✅ Query structure looks optimized.")
    return tips
# 4. Streamlit App Interface
def main():
    """Streamlit entry point: preview logs, train the model, score user input.

    Flow: load the CSV logs, show a preview, fit the classifier, then (when
    the user enters a query) extract crude lexical features, predict
    fast/slow, and print heuristic optimization tips.
    """
    st.markdown(
        "<h1 style='text-align: center; white-space: nowrap;'>🚀 SQL Query Performance Predictor</h1>",
        unsafe_allow_html=True,
    )

    # Load and preview the historical query logs.
    df = load_data()
    st.subheader("📊 Query Logs Preview")
    st.write(df.head())

    # Fit the slow-query classifier on the logs.
    model = preprocess_and_train_model(df)

    st.subheader("📝 Enter Your SQL Query")
    query_text = st.text_area("SQL Query", height=150)
    if query_text:
        lowered = query_text.lower()

        # Crude lexical features mirroring the training columns.
        query_length = len(query_text)
        num_joins = lowered.count('join')
        # More than one SELECT alongside a FROM is treated as a subquery.
        has_subquery = 1 if 'select' in lowered and 'from' in lowered and lowered.count('select') > 1 else 0
        uses_index = 1 if 'index' in lowered else 0

        query_features = pd.DataFrame({
            'query_length': [query_length],
            'num_joins': [num_joins],
            'has_subquery': [has_subquery],
            'uses_index': [uses_index],
        })

        prediction = model.predict(query_features)[0]
        if prediction == 1:
            st.error("🐢 This query is likely to be **Slow**.")
        else:
            # Original string literal was broken across two lines (syntax
            # error); restored here as a single line.
            st.success("✅ This query is likely to be **Fast**.")

        st.subheader("🛠️ Optimization Tips")
        for tip in recommend_tips(query_features.iloc[0]):
            st.write(tip)
# Run the app when this file is executed directly (e.g. `streamlit run <file>`).
if __name__ == '__main__':
    main()