SVashishta1 committed on
Commit
2f13356
·
1 Parent(s): 77df513

Fix: Add direct handling for tip queries and make schema instructions more explicit

Browse files
Files changed (1) hide show
  1. app.py +49 -13
app.py CHANGED
@@ -68,17 +68,20 @@ current_plot = None
68
  query_prompt = ChatPromptTemplate.from_template("""
69
  You are a SQL expert. Given a question about data in a table, write a SQLite-compatible SQL query to answer the question.
70
 
71
- Important guidelines:
72
- 1. MOST IMPORTANT: Only use columns that are explicitly provided in the context. Do not assume or invent columns.
73
- 2. Use SQLite syntax (not PostgreSQL or MySQL)
74
- 3. For date functions, use strftime() instead of EXTRACT
 
 
 
 
75
  - Example: strftime('%Y', date_column) instead of EXTRACT(YEAR FROM date_column)
76
- 4. SQLite doesn't have TRUNCATE function, use CAST((column / bin_size) AS INT) * bin_size instead
77
- 5. For percentiles, use window functions or approximate methods
78
- 6. Keep queries efficient and focused on answering the specific question
79
- 7. Always use 'data_tab' as the table name
80
- 8. IMPORTANT: Return ONLY the SQL query without any markdown formatting, explanations, or code blocks
81
- 9. If the question seems to require a column that isn't provided, use the most relevant existing column instead
82
 
83
  Question: {question}
84
  """)
@@ -118,7 +121,13 @@ Visualization type: {viz_type}
118
  # Define the prompt for interpreting the SQL query result
119
  interpret_prompt = ChatPromptTemplate.from_messages(
120
  [
121
- ("system", "You are an experienced data analyst. Provide a concise, natural language answer based on the given data summary. If relevant, give key statistics, trends, or patterns."),
 
 
 
 
 
 
122
  ("human", "Question: {question}\nSQL Query: {sql_query}\nData Summary:\n{data_summary}")
123
  ]
124
  )
@@ -209,7 +218,7 @@ def process_text_query(query, history):
209
  # Connect to the database
210
  conn = sqlite3.connect(DB_PATH)
211
 
212
- # Get column information for context
213
  cursor = conn.cursor()
214
  cursor.execute(f"PRAGMA table_info({current_context['table_name']});")
215
  columns_info = cursor.fetchall()
@@ -220,6 +229,27 @@ def process_text_query(query, history):
220
  columns_with_types = [f"{col} ({typ})" for col, typ in zip(columns, column_types)]
221
  columns_str = ", ".join(columns_with_types)
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  # Create sample data context
224
  sample_query = "SELECT * FROM data_tab LIMIT 3;"
225
  sample_df = pd.read_sql_query(sample_query, conn)
@@ -227,13 +257,19 @@ def process_text_query(query, history):
227
 
228
  # Create question with detailed context
229
  question_with_context = f"""
230
- The table 'data_tab' has the following columns with their types:
 
 
231
  {columns_str}
232
 
 
 
233
  Here's a sample of the data:
234
  {sample_data}
235
 
236
  User question: {query}
 
 
237
  """
238
 
239
  # Special handling for visualization types that need raw data
 
68
  query_prompt = ChatPromptTemplate.from_template("""
69
  You are a SQL expert. Given a question about data in a table, write a SQLite-compatible SQL query to answer the question.
70
 
71
+ CRITICAL RULES:
72
+ 1. ONLY use columns that are EXPLICITLY provided in the context. DO NOT invent or assume columns exist if they are not listed.
73
+ 2. If the user asks about a column that doesn't exist, use a similar column from the available ones or explain that the data doesn't contain that information.
74
+ 3. ALWAYS double-check that every column in your query is in the list of available columns.
75
+
76
+ Technical guidelines:
77
+ 4. Use SQLite syntax (not PostgreSQL or MySQL)
78
+ 5. For date functions, use strftime() instead of EXTRACT
79
  - Example: strftime('%Y', date_column) instead of EXTRACT(YEAR FROM date_column)
80
+ 6. SQLite doesn't have TRUNCATE function, use CAST((column / bin_size) AS INT) * bin_size instead
81
+ 7. For percentiles, use window functions or approximate methods
82
+ 8. Keep queries efficient and focused on answering the specific question
83
+ 9. Always use 'data_tab' as the table name
84
+ 10. Return ONLY the SQL query without any markdown formatting, explanations, or code blocks
 
85
 
86
  Question: {question}
87
  """)
 
121
  # Define the prompt for interpreting the SQL query result
122
  interpret_prompt = ChatPromptTemplate.from_messages(
123
  [
124
+ ("system", """You are an experienced data analyst. Provide a concise, natural language answer based on the given data summary.
125
+
126
+ If relevant, give key statistics, trends, or patterns. Be clear about what the data shows and doesn't show.
127
+
128
+ If the SQL query had to use alternative columns because the exact ones requested weren't available, explain this clearly to the user.
129
+
130
+ For example, if they asked about 'fare_amount' but the dataset has 'fare' or 'total_fare' instead, mention this substitution."""),
131
  ("human", "Question: {question}\nSQL Query: {sql_query}\nData Summary:\n{data_summary}")
132
  ]
133
  )
 
218
  # Connect to the database
219
  conn = sqlite3.connect(DB_PATH)
220
 
221
+ # Get schema information FIRST before doing anything else
222
  cursor = conn.cursor()
223
  cursor.execute(f"PRAGMA table_info({current_context['table_name']});")
224
  columns_info = cursor.fetchall()
 
229
  columns_with_types = [f"{col} ({typ})" for col, typ in zip(columns, column_types)]
230
  columns_str = ", ".join(columns_with_types)
231
 
232
+ # Handle specific queries directly based on schema
233
+ if "highest tip" in query.lower() or "largest tip" in query.lower() or "maximum tip" in query.lower():
234
+ # Look for tip-related columns
235
+ tip_columns = [col for col in columns if "tip" in col.lower() or "gratuity" in col.lower()]
236
+ if tip_columns:
237
+ print(f"Found tip-related columns: {tip_columns}")
238
+ sql_query = f"SELECT MAX({tip_columns[0]}) AS highest_tip FROM data_tab"
239
+
240
+ # Execute the query directly
241
+ result_df = pd.read_sql_query(sql_query, conn)
242
+
243
+ # Generate response
244
+ highest_tip = result_df.iloc[0, 0]
245
+ response = f"The highest tip in the dataset is {highest_tip}."
246
+ history[-1][1] = response
247
+ return response, history
248
+ else:
249
+ response = f"I couldn't find any columns related to tips in the dataset. Available columns are: {', '.join(columns)}"
250
+ history[-1][1] = response
251
+ return response, history
252
+
253
  # Create sample data context
254
  sample_query = "SELECT * FROM data_tab LIMIT 3;"
255
  sample_df = pd.read_sql_query(sample_query, conn)
 
257
 
258
  # Create question with detailed context
259
  question_with_context = f"""
260
+ IMPORTANT: ONLY use the exact columns listed below. DO NOT use any columns not explicitly listed here.
261
+
262
+ The table 'data_tab' has these columns with their types:
263
  {columns_str}
264
 
265
+ Available columns (exact names): {', '.join(columns)}
266
+
267
  Here's a sample of the data:
268
  {sample_data}
269
 
270
  User question: {query}
271
+
272
+ Remember to ONLY use the columns listed above. If the question seems to require a column that doesn't exist, use the most relevant existing column instead or explain that the data doesn't contain that information.
273
  """
274
 
275
  # Special handling for visualization types that need raw data