Yatheshr committed on
Commit
9c0f45b
·
verified ·
1 Parent(s): 157002d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -25
app.py CHANGED
@@ -1,29 +1,30 @@
1
  import streamlit as st
2
  import pandas as pd
3
  from sklearn.ensemble import RandomForestClassifier
4
- from sklearn.model_selection import train_test_split
5
 
6
  # 1. Load Real SQL Query Logs
7
- # Load query logs from CSV (use your own CSV file here)
8
- #@st.cache_data
9
  def load_data():
10
- # Make sure the CSV file is located correctly in the Hugging Face Space
11
  return pd.read_csv("data/sql_query_logs_2025_04_25.csv")
12
 
13
  # 2. Preprocess Data and Train the Model
14
  def preprocess_and_train_model(df):
15
- # Define 'slow' query threshold (avg_exec_time_ms > 1000 ms)
 
 
 
 
16
  df['is_slow'] = df['avg_exec_time_ms'] > 1000
 
17
  features = ['query_length', 'num_joins', 'has_subquery', 'uses_index']
18
  X = df[features]
19
  y = df['is_slow'].astype(int)
20
 
21
- # Train a RandomForest model
22
  model = RandomForestClassifier()
23
  model.fit(X, y)
24
  return model
25
 
26
- # 3. Add a Recommendation Engine
27
  def recommend_tips(query):
28
  tips = []
29
  if query['query_length'] > 800:
@@ -34,7 +35,6 @@ def recommend_tips(query):
34
  tips.append("🧠 Subquery detected — flatten subqueries if possible.")
35
  if not query['uses_index']:
36
  tips.append("⚑ Index not used — create indexes on filter/join columns.")
37
-
38
  if not tips:
39
  tips.append("✅ Query structure looks optimized.")
40
  return tips
@@ -43,29 +43,25 @@ def recommend_tips(query):
43
  def main():
44
  st.title("SQL Query Performance Predictor")
45
 
46
- # Step 1: Load the Data
47
  df = load_data()
48
 
49
- # Display a preview of the data
50
  st.subheader("Query Logs Preview")
51
  st.write(df.head())
52
 
53
- # Step 2: Train the Model
54
  model = preprocess_and_train_model(df)
55
 
56
- # Step 3: User Input for Query Analysis
57
  st.subheader("Enter Your SQL Query")
58
  query_text = st.text_area("SQL Query", height=150)
59
 
60
  if query_text:
61
- # Process the query to extract features
62
  query_length = len(query_text)
63
- num_joins = (query_text.lower().count('join') // 4) # Approximation
64
- has_subquery = 1 if 'select' in query_text.lower() and 'from' in query_text.lower() and 'select' in query_text.lower() else 0
65
-
66
- # Dummy logic to determine if an index is used — you can extend this logic with actual parsing
67
- uses_index = 1 if "index" in query_text.lower() else 0
68
-
69
  query_features = pd.DataFrame({
70
  'query_length': [query_length],
71
  'num_joins': [num_joins],
@@ -73,21 +69,17 @@ def main():
73
  'uses_index': [uses_index]
74
  })
75
 
76
- # Step 4: Prediction
77
  prediction = model.predict(query_features)[0]
78
 
79
- # Show result
80
  if prediction == 1:
81
  st.error("🛑 This query is likely to be **Slow**.")
82
  else:
83
  st.success("✅ This query is likely to be **Fast**.")
84
 
85
- # Show optimization recommendations
86
  st.subheader("🛠️ Optimization Tips")
87
- recommendations = recommend_tips(query_features.iloc[0])
88
- for tip in recommendations:
89
  st.write(tip)
90
 
91
- # Run the Streamlit app
92
  if __name__ == '__main__':
93
  main()
 
1
  import streamlit as st
2
  import pandas as pd
3
  from sklearn.ensemble import RandomForestClassifier
 
4
 
5
  # 1. Load Real SQL Query Logs
 
 
6
  def load_data():
7
+ # Make sure the file path matches your HF Space structure
8
  return pd.read_csv("data/sql_query_logs_2025_04_25.csv")
9
 
10
  # 2. Preprocess Data and Train the Model
11
  def preprocess_and_train_model(df):
12
+ # Convert avg_exec_time_ms to numeric in case it's string
13
+ df['avg_exec_time_ms'] = pd.to_numeric(df['avg_exec_time_ms'], errors='coerce')
14
+ df = df.dropna(subset=['avg_exec_time_ms']) # Remove rows with invalid/missing exec time
15
+
16
+ # Define 'slow' query threshold
17
  df['is_slow'] = df['avg_exec_time_ms'] > 1000
18
+
19
  features = ['query_length', 'num_joins', 'has_subquery', 'uses_index']
20
  X = df[features]
21
  y = df['is_slow'].astype(int)
22
 
 
23
  model = RandomForestClassifier()
24
  model.fit(X, y)
25
  return model
26
 
27
+ # 3. Recommendation Engine
28
  def recommend_tips(query):
29
  tips = []
30
  if query['query_length'] > 800:
 
35
  tips.append("🧠 Subquery detected — flatten subqueries if possible.")
36
  if not query['uses_index']:
37
  tips.append("⚑ Index not used — create indexes on filter/join columns.")
 
38
  if not tips:
39
  tips.append("✅ Query structure looks optimized.")
40
  return tips
 
43
  def main():
44
  st.title("SQL Query Performance Predictor")
45
 
46
+ # Step 1: Load Data
47
  df = load_data()
48
 
 
49
  st.subheader("Query Logs Preview")
50
  st.write(df.head())
51
 
52
+ # Step 2: Train Model
53
  model = preprocess_and_train_model(df)
54
 
55
+ # Step 3: User Query Input
56
  st.subheader("Enter Your SQL Query")
57
  query_text = st.text_area("SQL Query", height=150)
58
 
59
  if query_text:
 
60
  query_length = len(query_text)
61
+ num_joins = query_text.lower().count('join')
62
+ has_subquery = int('select' in query_text.lower() and 'from' in query_text.lower() and query_text.lower().count('select') > 1)
63
+ uses_index = int('index' in query_text.lower())
64
+
 
 
65
  query_features = pd.DataFrame({
66
  'query_length': [query_length],
67
  'num_joins': [num_joins],
 
69
  'uses_index': [uses_index]
70
  })
71
 
 
72
  prediction = model.predict(query_features)[0]
73
 
 
74
  if prediction == 1:
75
  st.error("🛑 This query is likely to be **Slow**.")
76
  else:
77
  st.success("✅ This query is likely to be **Fast**.")
78
 
 
79
  st.subheader("🛠️ Optimization Tips")
80
+ for tip in recommend_tips(query_features.iloc[0]):
 
81
  st.write(tip)
82
 
83
+ # Run the app
84
  if __name__ == '__main__':
85
  main()