Yatheshr committed on
Commit
5944c9b
·
verified ·
1 Parent(s): 9c0f45b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -24
app.py CHANGED
@@ -1,30 +1,39 @@
1
  import streamlit as st
2
  import pandas as pd
3
  from sklearn.ensemble import RandomForestClassifier
 
4
 
5
  # 1. Load Real SQL Query Logs
6
  def load_data():
7
- # Make sure the file path matches your HF Space structure
8
  return pd.read_csv("data/sql_query_logs_2025_04_25.csv")
9
 
10
  # 2. Preprocess Data and Train the Model
11
  def preprocess_and_train_model(df):
12
- # Convert avg_exec_time_ms to numeric in case it's string
13
  df['avg_exec_time_ms'] = pd.to_numeric(df['avg_exec_time_ms'], errors='coerce')
14
- df = df.dropna(subset=['avg_exec_time_ms']) # Remove rows with invalid/missing exec time
15
 
16
- # Define 'slow' query threshold
 
 
 
 
 
 
 
17
  df['is_slow'] = df['avg_exec_time_ms'] > 1000
18
 
 
19
  features = ['query_length', 'num_joins', 'has_subquery', 'uses_index']
20
  X = df[features]
21
  y = df['is_slow'].astype(int)
22
 
 
23
  model = RandomForestClassifier()
24
  model.fit(X, y)
25
  return model
26
 
27
- # 3. Recommendation Engine
28
  def recommend_tips(query):
29
  tips = []
30
  if query['query_length'] > 800:
@@ -35,32 +44,34 @@ def recommend_tips(query):
35
  tips.append("🧠 Subquery detected β€” flatten subqueries if possible.")
36
  if not query['uses_index']:
37
  tips.append("⚑ Index not used β€” create indexes on filter/join columns.")
 
38
  if not tips:
39
  tips.append("βœ… Query structure looks optimized.")
40
  return tips
41
 
42
  # 4. Streamlit App Interface
43
  def main():
44
- st.title("SQL Query Performance Predictor")
45
 
46
- # Step 1: Load Data
47
  df = load_data()
48
 
49
- st.subheader("Query Logs Preview")
50
  st.write(df.head())
51
 
52
- # Step 2: Train Model
53
  model = preprocess_and_train_model(df)
54
 
55
- # Step 3: User Query Input
56
- st.subheader("Enter Your SQL Query")
57
  query_text = st.text_area("SQL Query", height=150)
58
 
59
  if query_text:
 
60
  query_length = len(query_text)
61
  num_joins = query_text.lower().count('join')
62
- has_subquery = int('select' in query_text.lower() and 'from' in query_text.lower() and query_text.lower().count('select') > 1)
63
- uses_index = int('index' in query_text.lower())
64
 
65
  query_features = pd.DataFrame({
66
  'query_length': [query_length],
@@ -69,17 +80,8 @@ def main():
69
  'uses_index': [uses_index]
70
  })
71
 
 
72
  prediction = model.predict(query_features)[0]
73
 
74
  if prediction == 1:
75
- st.error("πŸ›‘ This query is likely to be **Slow**.")
76
- else:
77
- st.success("βœ… This query is likely to be **Fast**.")
78
-
79
- st.subheader("πŸ› οΈ Optimization Tips")
80
- for tip in recommend_tips(query_features.iloc[0]):
81
- st.write(tip)
82
-
83
- # Run the app
84
- if __name__ == '__main__':
85
- main()
 
1
  import streamlit as st
2
  import pandas as pd
3
  from sklearn.ensemble import RandomForestClassifier
4
+ from sklearn.model_selection import train_test_split
5
 
6
  # 1. Load Real SQL Query Logs
7
  def load_data():
8
+ # Make sure this filename matches the actual file in your "data/" directory
9
  return pd.read_csv("data/sql_query_logs_2025_04_25.csv")
10
 
11
  # 2. Preprocess Data and Train the Model
12
  def preprocess_and_train_model(df):
13
+ # Ensure avg_exec_time_ms is numeric
14
  df['avg_exec_time_ms'] = pd.to_numeric(df['avg_exec_time_ms'], errors='coerce')
 
15
 
16
+ # Ensure feature columns are numeric
17
+ for col in ['query_length', 'num_joins', 'has_subquery', 'uses_index']:
18
+ df[col] = pd.to_numeric(df[col], errors='coerce')
19
+
20
+ # Drop rows with any NaNs in required columns
21
+ df = df.dropna(subset=['avg_exec_time_ms', 'query_length', 'num_joins', 'has_subquery', 'uses_index'])
22
+
23
+ # Define 'slow' query (threshold = 1000 ms)
24
  df['is_slow'] = df['avg_exec_time_ms'] > 1000
25
 
26
+ # Prepare feature matrix and target variable
27
  features = ['query_length', 'num_joins', 'has_subquery', 'uses_index']
28
  X = df[features]
29
  y = df['is_slow'].astype(int)
30
 
31
+ # Train the model
32
  model = RandomForestClassifier()
33
  model.fit(X, y)
34
  return model
35
 
36
+ # 3. Provide Recommendations
37
  def recommend_tips(query):
38
  tips = []
39
  if query['query_length'] > 800:
 
44
  tips.append("🧠 Subquery detected β€” flatten subqueries if possible.")
45
  if not query['uses_index']:
46
  tips.append("⚑ Index not used β€” create indexes on filter/join columns.")
47
+
48
  if not tips:
49
  tips.append("βœ… Query structure looks optimized.")
50
  return tips
51
 
52
  # 4. Streamlit App Interface
53
  def main():
54
+ st.title("πŸš€ SQL Query Performance Predictor")
55
 
56
+ # Load data
57
  df = load_data()
58
 
59
+ st.subheader("πŸ“Š Query Logs Preview")
60
  st.write(df.head())
61
 
62
+ # Train model
63
  model = preprocess_and_train_model(df)
64
 
65
+ # Input for user SQL query
66
+ st.subheader("πŸ“ Enter Your SQL Query")
67
  query_text = st.text_area("SQL Query", height=150)
68
 
69
  if query_text:
70
+ # Extract features from user query
71
  query_length = len(query_text)
72
  num_joins = query_text.lower().count('join')
73
+ has_subquery = 1 if 'select' in query_text.lower() and 'from' in query_text.lower() and query_text.lower().count('select') > 1 else 0
74
+ uses_index = 1 if 'index' in query_text.lower() else 0
75
 
76
  query_features = pd.DataFrame({
77
  'query_length': [query_length],
 
80
  'uses_index': [uses_index]
81
  })
82
 
83
+ # Predict
84
  prediction = model.predict(query_features)[0]
85
 
86
  if prediction == 1:
87
+ st.error("πŸ›‘ This query is likely to be **Slow**