Spaces:

Yatheshr
/

Query-Performance-Predictor

Sleeping

App Files Community

Yatheshr committed on Apr 25, 2025

Commit

9c0f45b

verified ·

1 Parent(s): 157002d

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -25

app.py CHANGED Viewed

@@ -1,29 +1,30 @@
 import streamlit as st
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
 # 1. Load Real SQL Query Logs
-# Load query logs from CSV (use your own CSV file here)
-#@st.cache_data
 def load_data():
-    # Make sure the CSV file is located correctly in the Hugging Face Space
     return pd.read_csv("data/sql_query_logs_2025_04_25.csv")
 # 2. Preprocess Data and Train the Model
 def preprocess_and_train_model(df):
-    # Define 'slow' query threshold (avg_exec_time_ms > 1000 ms)
     df['is_slow'] = df['avg_exec_time_ms'] > 1000
     features = ['query_length', 'num_joins', 'has_subquery', 'uses_index']
     X = df[features]
     y = df['is_slow'].astype(int)
-    # Train a RandomForest model
     model = RandomForestClassifier()
     model.fit(X, y)
     return model
-# 3. Add a Recommendation Engine
 def recommend_tips(query):
     tips = []
     if query['query_length'] > 800:
@@ -34,7 +35,6 @@ def recommend_tips(query):
         tips.append("🧠 Subquery detected — flatten subqueries if possible.")
     if not query['uses_index']:
         tips.append("⚡ Index not used — create indexes on filter/join columns.")
     if not tips:
         tips.append("✅ Query structure looks optimized.")
     return tips
@@ -43,29 +43,25 @@ def recommend_tips(query):
 def main():
     st.title("SQL Query Performance Predictor")
-    # Step 1: Load the Data
     df = load_data()
-    # Display a preview of the data
     st.subheader("Query Logs Preview")
     st.write(df.head())
-    # Step 2: Train the Model
     model = preprocess_and_train_model(df)
-    # Step 3: User Input for Query Analysis
     st.subheader("Enter Your SQL Query")
     query_text = st.text_area("SQL Query", height=150)
     if query_text:
-        # Process the query to extract features
         query_length = len(query_text)
-        num_joins = (query_text.lower().count('join') // 4)  # Approximation
-        has_subquery = 1 if 'select' in query_text.lower() and 'from' in query_text.lower() and 'select' in query_text.lower() else 0
-        # Dummy logic to determine if an index is used — you can extend this logic with actual parsing
-        uses_index = 1 if "index" in query_text.lower() else 0
         query_features = pd.DataFrame({
             'query_length': [query_length],
             'num_joins': [num_joins],
@@ -73,21 +69,17 @@ def main():
             'uses_index': [uses_index]
         })
-        # Step 4: Prediction
         prediction = model.predict(query_features)[0]
-        # Show result
         if prediction == 1:
             st.error("🛑 This query is likely to be **Slow**.")
         else:
             st.success("✅ This query is likely to be **Fast**.")
-        # Show optimization recommendations
         st.subheader("🛠️ Optimization Tips")
-        recommendations = recommend_tips(query_features.iloc[0])
-        for tip in recommendations:
             st.write(tip)
-# Run the Streamlit app
 if __name__ == '__main__':
     main()

 import streamlit as st
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 # 1. Load Real SQL Query Logs
 def load_data():
+    # Make sure the file path matches your HF Space structure
     return pd.read_csv("data/sql_query_logs_2025_04_25.csv")
 # 2. Preprocess Data and Train the Model
 def preprocess_and_train_model(df):
+    # Convert avg_exec_time_ms to numeric in case it was read as a string
+    df['avg_exec_time_ms'] = pd.to_numeric(df['avg_exec_time_ms'], errors='coerce')
+    df = df.dropna(subset=['avg_exec_time_ms'])  # Remove rows with invalid/missing exec time
+    # Define 'slow' query threshold
     df['is_slow'] = df['avg_exec_time_ms'] > 1000
     features = ['query_length', 'num_joins', 'has_subquery', 'uses_index']
     X = df[features]
     y = df['is_slow'].astype(int)
     model = RandomForestClassifier()
     model.fit(X, y)
     return model
+# 3. Recommendation Engine
 def recommend_tips(query):
     tips = []
     if query['query_length'] > 800:
         tips.append("🧠 Subquery detected — flatten subqueries if possible.")
     if not query['uses_index']:
         tips.append("⚡ Index not used — create indexes on filter/join columns.")
     if not tips:
         tips.append("✅ Query structure looks optimized.")
     return tips
 def main():
     st.title("SQL Query Performance Predictor")
+    # Step 1: Load Data
     df = load_data()
     st.subheader("Query Logs Preview")
     st.write(df.head())
+    # Step 2: Train Model
     model = preprocess_and_train_model(df)
+    # Step 3: User Query Input
     st.subheader("Enter Your SQL Query")
     query_text = st.text_area("SQL Query", height=150)
     if query_text:
         query_length = len(query_text)
+        num_joins = query_text.lower().count('join')
+        has_subquery = int('select' in query_text.lower() and 'from' in query_text.lower() and query_text.lower().count('select') > 1)
+        uses_index = int('index' in query_text.lower())
         query_features = pd.DataFrame({
             'query_length': [query_length],
             'num_joins': [num_joins],
             'uses_index': [uses_index]
         })
         prediction = model.predict(query_features)[0]
         if prediction == 1:
             st.error("🛑 This query is likely to be **Slow**.")
         else:
             st.success("✅ This query is likely to be **Fast**.")
         st.subheader("🛠️ Optimization Tips")
+        for tip in recommend_tips(query_features.iloc[0]):
             st.write(tip)
+# Run the app
 if __name__ == '__main__':
     main()