Spaces:

Yatheshr
/

Query-Performance-Predictor

Sleeping

App Files Files Community

Yatheshr commited on Apr 25, 2025

Commit

5944c9b

verified ·

1 Parent(s): 9c0f45b

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -24

app.py CHANGED Viewed

@@ -1,30 +1,39 @@
 import streamlit as st
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 # 1. Load Real SQL Query Logs
 def load_data():
-    # Make sure the file path matches your HF Space structure
     return pd.read_csv("data/sql_query_logs_2025_04_25.csv")
 # 2. Preprocess Data and Train the Model
 def preprocess_and_train_model(df):
-    # Convert avg_exec_time_ms to numeric in case it's string
     df['avg_exec_time_ms'] = pd.to_numeric(df['avg_exec_time_ms'], errors='coerce')
-    df = df.dropna(subset=['avg_exec_time_ms'])  # Remove rows with invalid/missing exec time
-    # Define 'slow' query threshold
     df['is_slow'] = df['avg_exec_time_ms'] > 1000
     features = ['query_length', 'num_joins', 'has_subquery', 'uses_index']
     X = df[features]
     y = df['is_slow'].astype(int)
     model = RandomForestClassifier()
     model.fit(X, y)
     return model
-# 3. Recommendation Engine
 def recommend_tips(query):
     tips = []
     if query['query_length'] > 800:
@@ -35,32 +44,34 @@ def recommend_tips(query):
         tips.append("🧠 Subquery detected — flatten subqueries if possible.")
     if not query['uses_index']:
         tips.append("⚡ Index not used — create indexes on filter/join columns.")
     if not tips:
         tips.append("✅ Query structure looks optimized.")
     return tips
 # 4. Streamlit App Interface
 def main():
-    st.title("SQL Query Performance Predictor")
-    # Step 1: Load Data
     df = load_data()
-    st.subheader("Query Logs Preview")
     st.write(df.head())
-    # Step 2: Train Model
     model = preprocess_and_train_model(df)
-    # Step 3: User Query Input
-    st.subheader("Enter Your SQL Query")
     query_text = st.text_area("SQL Query", height=150)
     if query_text:
         query_length = len(query_text)
         num_joins = query_text.lower().count('join')
-        has_subquery = int('select' in query_text.lower() and 'from' in query_text.lower() and query_text.lower().count('select') > 1)
-        uses_index = int('index' in query_text.lower())
         query_features = pd.DataFrame({
             'query_length': [query_length],
@@ -69,17 +80,8 @@ def main():
             'uses_index': [uses_index]
         })
         prediction = model.predict(query_features)[0]
         if prediction == 1:
-            st.error("🛑 This query is likely to be **Slow**.")
-        else:
-            st.success("✅ This query is likely to be **Fast**.")
-        st.subheader("🛠️ Optimization Tips")
-        for tip in recommend_tips(query_features.iloc[0]):
-            st.write(tip)
-# Run the app
-if __name__ == '__main__':
-    main()

 import streamlit as st
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
 # 1. Load Real SQL Query Logs
 def load_data():
+    # Make sure this filename matches the actual file in your "data/" directory
     return pd.read_csv("data/sql_query_logs_2025_04_25.csv")
 # 2. Preprocess Data and Train the Model
 def preprocess_and_train_model(df):
+    # Ensure avg_exec_time_ms is numeric
     df['avg_exec_time_ms'] = pd.to_numeric(df['avg_exec_time_ms'], errors='coerce')
+    # Ensure feature columns are numeric
+    for col in ['query_length', 'num_joins', 'has_subquery', 'uses_index']:
+        df[col] = pd.to_numeric(df[col], errors='coerce')
+    # Drop rows with any NaNs in required columns
+    df = df.dropna(subset=['avg_exec_time_ms', 'query_length', 'num_joins', 'has_subquery', 'uses_index'])
+    # Define 'slow' query (threshold = 1000 ms)
     df['is_slow'] = df['avg_exec_time_ms'] > 1000
+    # Prepare feature matrix and target variable
     features = ['query_length', 'num_joins', 'has_subquery', 'uses_index']
     X = df[features]
     y = df['is_slow'].astype(int)
+    # Train the model
     model = RandomForestClassifier()
     model.fit(X, y)
     return model
+# 3. Provide Recommendations
 def recommend_tips(query):
     tips = []
     if query['query_length'] > 800:
         tips.append("🧠 Subquery detected — flatten subqueries if possible.")
     if not query['uses_index']:
         tips.append("⚡ Index not used — create indexes on filter/join columns.")
     if not tips:
         tips.append("✅ Query structure looks optimized.")
     return tips
 # 4. Streamlit App Interface
 def main():
+    st.title("🚀 SQL Query Performance Predictor")
+    # Load data
     df = load_data()
+    st.subheader("📊 Query Logs Preview")
     st.write(df.head())
+    # Train model
     model = preprocess_and_train_model(df)
+    # Input for user SQL query
+    st.subheader("📝 Enter Your SQL Query")
     query_text = st.text_area("SQL Query", height=150)
     if query_text:
+        # Extract features from user query
         query_length = len(query_text)
         num_joins = query_text.lower().count('join')
+        has_subquery = 1 if 'select' in query_text.lower() and 'from' in query_text.lower() and query_text.lower().count('select') > 1 else 0
+        uses_index = 1 if 'index' in query_text.lower() else 0
         query_features = pd.DataFrame({
             'query_length': [query_length],
             'uses_index': [uses_index]
         })
+        # Predict
         prediction = model.predict(query_features)[0]
         if prediction == 1:
+            st.error("🛑 This query is likely to be **Slow**