SVashishta1
committed on
Commit
·
2f13356
1
Parent(s):
77df513
Fix: Add direct handling for tip queries and make schema instructions more explicit
Browse files
app.py
CHANGED
|
@@ -68,17 +68,20 @@ current_plot = None
|
|
| 68 |
query_prompt = ChatPromptTemplate.from_template("""
|
| 69 |
You are a SQL expert. Given a question about data in a table, write a SQLite-compatible SQL query to answer the question.
|
| 70 |
|
| 71 |
-
|
| 72 |
-
1.
|
| 73 |
-
2.
|
| 74 |
-
3.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
- Example: strftime('%Y', date_column) instead of EXTRACT(YEAR FROM date_column)
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
9. If the question seems to require a column that isn't provided, use the most relevant existing column instead
|
| 82 |
|
| 83 |
Question: {question}
|
| 84 |
""")
|
|
@@ -118,7 +121,13 @@ Visualization type: {viz_type}
|
|
| 118 |
# Define the prompt for interpreting the SQL query result
|
| 119 |
interpret_prompt = ChatPromptTemplate.from_messages(
|
| 120 |
[
|
| 121 |
-
("system", "You are an experienced data analyst. Provide a concise, natural language answer based on the given data summary.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
("human", "Question: {question}\nSQL Query: {sql_query}\nData Summary:\n{data_summary}")
|
| 123 |
]
|
| 124 |
)
|
|
@@ -209,7 +218,7 @@ def process_text_query(query, history):
|
|
| 209 |
# Connect to the database
|
| 210 |
conn = sqlite3.connect(DB_PATH)
|
| 211 |
|
| 212 |
-
# Get
|
| 213 |
cursor = conn.cursor()
|
| 214 |
cursor.execute(f"PRAGMA table_info({current_context['table_name']});")
|
| 215 |
columns_info = cursor.fetchall()
|
|
@@ -220,6 +229,27 @@ def process_text_query(query, history):
|
|
| 220 |
columns_with_types = [f"{col} ({typ})" for col, typ in zip(columns, column_types)]
|
| 221 |
columns_str = ", ".join(columns_with_types)
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
# Create sample data context
|
| 224 |
sample_query = "SELECT * FROM data_tab LIMIT 3;"
|
| 225 |
sample_df = pd.read_sql_query(sample_query, conn)
|
|
@@ -227,13 +257,19 @@ def process_text_query(query, history):
|
|
| 227 |
|
| 228 |
# Create question with detailed context
|
| 229 |
question_with_context = f"""
|
| 230 |
-
|
|
|
|
|
|
|
| 231 |
{columns_str}
|
| 232 |
|
|
|
|
|
|
|
| 233 |
Here's a sample of the data:
|
| 234 |
{sample_data}
|
| 235 |
|
| 236 |
User question: {query}
|
|
|
|
|
|
|
| 237 |
"""
|
| 238 |
|
| 239 |
# Special handling for visualization types that need raw data
|
|
|
|
| 68 |
query_prompt = ChatPromptTemplate.from_template("""
|
| 69 |
You are a SQL expert. Given a question about data in a table, write a SQLite-compatible SQL query to answer the question.
|
| 70 |
|
| 71 |
+
CRITICAL RULES:
|
| 72 |
+
1. ONLY use columns that are EXPLICITLY provided in the context. DO NOT invent or assume columns exist if they are not listed.
|
| 73 |
+
2. If the user asks about a column that doesn't exist, use a similar column from the available ones or explain that the data doesn't contain that information.
|
| 74 |
+
3. ALWAYS double-check that every column in your query is in the list of available columns.
|
| 75 |
+
|
| 76 |
+
Technical guidelines:
|
| 77 |
+
4. Use SQLite syntax (not PostgreSQL or MySQL)
|
| 78 |
+
5. For date functions, use strftime() instead of EXTRACT
|
| 79 |
- Example: strftime('%Y', date_column) instead of EXTRACT(YEAR FROM date_column)
|
| 80 |
+
6. SQLite doesn't have TRUNCATE function, use CAST((column / bin_size) AS INT) * bin_size instead
|
| 81 |
+
7. For percentiles, use window functions or approximate methods
|
| 82 |
+
8. Keep queries efficient and focused on answering the specific question
|
| 83 |
+
9. Always use 'data_tab' as the table name
|
| 84 |
+
10. Return ONLY the SQL query without any markdown formatting, explanations, or code blocks
|
|
|
|
| 85 |
|
| 86 |
Question: {question}
|
| 87 |
""")
|
|
|
|
| 121 |
# Define the prompt for interpreting the SQL query result
|
| 122 |
interpret_prompt = ChatPromptTemplate.from_messages(
|
| 123 |
[
|
| 124 |
+
("system", """You are an experienced data analyst. Provide a concise, natural language answer based on the given data summary.
|
| 125 |
+
|
| 126 |
+
If relevant, give key statistics, trends, or patterns. Be clear about what the data shows and doesn't show.
|
| 127 |
+
|
| 128 |
+
If the SQL query had to use alternative columns because the exact ones requested weren't available, explain this clearly to the user.
|
| 129 |
+
|
| 130 |
+
For example, if they asked about 'fare_amount' but the dataset has 'fare' or 'total_fare' instead, mention this substitution."""),
|
| 131 |
("human", "Question: {question}\nSQL Query: {sql_query}\nData Summary:\n{data_summary}")
|
| 132 |
]
|
| 133 |
)
|
|
|
|
| 218 |
# Connect to the database
|
| 219 |
conn = sqlite3.connect(DB_PATH)
|
| 220 |
|
| 221 |
+
# Get schema information FIRST before doing anything else
|
| 222 |
cursor = conn.cursor()
|
| 223 |
cursor.execute(f"PRAGMA table_info({current_context['table_name']});")
|
| 224 |
columns_info = cursor.fetchall()
|
|
|
|
| 229 |
columns_with_types = [f"{col} ({typ})" for col, typ in zip(columns, column_types)]
|
| 230 |
columns_str = ", ".join(columns_with_types)
|
| 231 |
|
| 232 |
+
# Handle specific queries directly based on schema
|
| 233 |
+
if "highest tip" in query.lower() or "largest tip" in query.lower() or "maximum tip" in query.lower():
|
| 234 |
+
# Look for tip-related columns
|
| 235 |
+
tip_columns = [col for col in columns if "tip" in col.lower() or "gratuity" in col.lower()]
|
| 236 |
+
if tip_columns:
|
| 237 |
+
print(f"Found tip-related columns: {tip_columns}")
|
| 238 |
+
sql_query = f"SELECT MAX({tip_columns[0]}) AS highest_tip FROM data_tab"
|
| 239 |
+
|
| 240 |
+
# Execute the query directly
|
| 241 |
+
result_df = pd.read_sql_query(sql_query, conn)
|
| 242 |
+
|
| 243 |
+
# Generate response
|
| 244 |
+
highest_tip = result_df.iloc[0, 0]
|
| 245 |
+
response = f"The highest tip in the dataset is {highest_tip}."
|
| 246 |
+
history[-1][1] = response
|
| 247 |
+
return response, history
|
| 248 |
+
else:
|
| 249 |
+
response = f"I couldn't find any columns related to tips in the dataset. Available columns are: {', '.join(columns)}"
|
| 250 |
+
history[-1][1] = response
|
| 251 |
+
return response, history
|
| 252 |
+
|
| 253 |
# Create sample data context
|
| 254 |
sample_query = "SELECT * FROM data_tab LIMIT 3;"
|
| 255 |
sample_df = pd.read_sql_query(sample_query, conn)
|
|
|
|
| 257 |
|
| 258 |
# Create question with detailed context
|
| 259 |
question_with_context = f"""
|
| 260 |
+
IMPORTANT: ONLY use the exact columns listed below. DO NOT use any columns not explicitly listed here.
|
| 261 |
+
|
| 262 |
+
The table 'data_tab' has these columns with their types:
|
| 263 |
{columns_str}
|
| 264 |
|
| 265 |
+
Available columns (exact names): {', '.join(columns)}
|
| 266 |
+
|
| 267 |
Here's a sample of the data:
|
| 268 |
{sample_data}
|
| 269 |
|
| 270 |
User question: {query}
|
| 271 |
+
|
| 272 |
+
Remember to ONLY use the columns listed above. If the question seems to require a column that doesn't exist, use the most relevant existing column instead or explain that the data doesn't contain that information.
|
| 273 |
"""
|
| 274 |
|
| 275 |
# Special handling for visualization types that need raw data
|