import streamlit as st
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# 1. Load Real SQL Query Logs
def load_data():
    """Read the historical SQL query logs into a DataFrame.

    The filename must match the actual export under the ``data/`` directory.
    """
    log_path = "data/sql_query_logs_2025_04_25.csv"
    return pd.read_csv(log_path)
# 2. Preprocess Data and Train the Model
def preprocess_and_train_model(df):
    """Clean the query-log DataFrame and fit a slow-query classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw query logs. Must contain the columns 'avg_exec_time_ms',
        'query_length', 'num_joins', 'has_subquery', 'uses_index'.

    Returns
    -------
    RandomForestClassifier
        Model fitted to predict whether a query is "slow"
        (average execution time above 1000 ms).
    """
    feature_cols = ['query_length', 'num_joins', 'has_subquery', 'uses_index']

    # Work on a copy so the caller's DataFrame (also shown in the UI)
    # is not mutated by the coercions and the added label column.
    df = df.copy()

    # Coerce target and features to numeric; bad values become NaN.
    df['avg_exec_time_ms'] = pd.to_numeric(df['avg_exec_time_ms'], errors='coerce')
    for col in feature_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop rows missing any required value.
    df = df.dropna(subset=['avg_exec_time_ms'] + feature_cols)

    # Label: a query is "slow" if it averages over 1000 ms.
    df['is_slow'] = df['avg_exec_time_ms'] > 1000

    X = df[feature_cols]
    y = df['is_slow'].astype(int)

    # Fixed seed so predictions are reproducible across Streamlit reruns.
    model = RandomForestClassifier(random_state=42)
    model.fit(X, y)
    return model
# 3. Provide Recommendations
def recommend_tips(query):
    """Return heuristic optimization tips for one query-feature row.

    Parameters
    ----------
    query : Mapping
        Dict-like row (e.g. ``DataFrame.iloc[0]``) with keys
        'query_length', 'num_joins', 'has_subquery', 'uses_index'.

    Returns
    -------
    list[str]
        One tip per triggered heuristic; a single "looks optimized"
        message when none trigger.
    """
    tips = []
    if query['query_length'] > 800:
        tips.append("📏 Query is long — consider breaking it into smaller chunks.")
    if query['num_joins'] > 3:
        tips.append("🪢 Too many JOINs — simplify joins or add proper indexing.")
    if query['has_subquery']:
        tips.append("🧠 Subquery detected — flatten subqueries if possible.")
    if not query['uses_index']:
        tips.append("⚡ Index not used — create indexes on filter/join columns.")
    if not tips:
        # Original string literal was broken across two lines (syntax error);
        # restored here as a single line.
        tips.append("✅ Query structure looks optimized.")
    return tips
# 4. Streamlit App Interface
def main():
    """Streamlit entry point: preview logs, train the model, score user input.

    Flow: load the CSV logs, show a preview, fit the classifier, then (when
    the user enters a query) extract crude lexical features, predict
    fast/slow, and print heuristic optimization tips.
    """
    st.markdown(
        "<h1 style='text-align: center; white-space: nowrap;'>🚀 SQL Query Performance Predictor</h1>",
        unsafe_allow_html=True,
    )

    # Load and preview the historical query logs.
    df = load_data()
    st.subheader("📊 Query Logs Preview")
    st.write(df.head())

    # Fit the slow-query classifier on the logs.
    model = preprocess_and_train_model(df)

    st.subheader("📝 Enter Your SQL Query")
    query_text = st.text_area("SQL Query", height=150)
    if query_text:
        lowered = query_text.lower()

        # Crude lexical features mirroring the training columns.
        query_length = len(query_text)
        num_joins = lowered.count('join')
        # More than one SELECT alongside a FROM is treated as a subquery.
        has_subquery = 1 if 'select' in lowered and 'from' in lowered and lowered.count('select') > 1 else 0
        uses_index = 1 if 'index' in lowered else 0

        query_features = pd.DataFrame({
            'query_length': [query_length],
            'num_joins': [num_joins],
            'has_subquery': [has_subquery],
            'uses_index': [uses_index],
        })

        prediction = model.predict(query_features)[0]
        if prediction == 1:
            st.error("🐢 This query is likely to be **Slow**.")
        else:
            # Original string literal was broken across two lines (syntax
            # error); restored here as a single line.
            st.success("✅ This query is likely to be **Fast**.")

        st.subheader("🛠️ Optimization Tips")
        for tip in recommend_tips(query_features.iloc[0]):
            st.write(tip)
# Run the app when this file is executed directly (e.g. `streamlit run <file>`).
if __name__ == '__main__':
    main()