Spaces:

Vashishta-S-2141
/

LLM_Powered_Database_Chatbot

Sleeping

App Files Files Community

SVashishta1 committed on Apr 24

Commit

2f13356

1 Parent(s): 77df513

Fix: Add direct handling for tip queries and make schema instructions more explicit

Browse files

Files changed (1) hide show

app.py +49 -13

app.py CHANGED Viewed

@@ -68,17 +68,20 @@ current_plot = None
 query_prompt = ChatPromptTemplate.from_template("""
 You are a SQL expert. Given a question about data in a table, write a SQLite-compatible SQL query to answer the question.
-Important guidelines:
-1. MOST IMPORTANT: Only use columns that are explicitly provided in the context. Do not assume or invent columns.
-2. Use SQLite syntax (not PostgreSQL or MySQL)
-3. For date functions, use strftime() instead of EXTRACT
    - Example: strftime('%Y', date_column) instead of EXTRACT(YEAR FROM date_column)
-4. SQLite doesn't have TRUNCATE function, use CAST((column / bin_size) AS INT) * bin_size instead
-5. For percentiles, use window functions or approximate methods
-6. Keep queries efficient and focused on answering the specific question
-7. Always use 'data_tab' as the table name
-8. IMPORTANT: Return ONLY the SQL query without any markdown formatting, explanations, or code blocks
-9. If the question seems to require a column that isn't provided, use the most relevant existing column instead
 Question: {question}
 """)
@@ -118,7 +121,13 @@ Visualization type: {viz_type}
 # Define the prompt for interpreting the SQL query result
 interpret_prompt = ChatPromptTemplate.from_messages(
     [
-        ("system", "You are an experienced data analyst. Provide a concise, natural language answer based on the given data summary. If relevant, give key statistics, trends, or patterns."),
         ("human", "Question: {question}\nSQL Query: {sql_query}\nData Summary:\n{data_summary}")
     ]
 )
@@ -209,7 +218,7 @@ def process_text_query(query, history):
             # Connect to the database
             conn = sqlite3.connect(DB_PATH)
-            # Get column information for context
             cursor = conn.cursor()
             cursor.execute(f"PRAGMA table_info({current_context['table_name']});")
             columns_info = cursor.fetchall()
@@ -220,6 +229,27 @@ def process_text_query(query, history):
             columns_with_types = [f"{col} ({typ})" for col, typ in zip(columns, column_types)]
             columns_str = ", ".join(columns_with_types)
             # Create sample data context
             sample_query = "SELECT * FROM data_tab LIMIT 3;"
             sample_df = pd.read_sql_query(sample_query, conn)
@@ -227,13 +257,19 @@ def process_text_query(query, history):
             # Create question with detailed context
             question_with_context = f"""
-The table 'data_tab' has the following columns with their types:
 {columns_str}
 Here's a sample of the data:
 {sample_data}
 User question: {query}
 """
             # Special handling for visualization types that need raw data

 # Prompt template used to turn a natural-language question into SQL.
 # It has a single placeholder, {question}; the rules in the template tell the
 # model to use only the columns supplied in the context, to write SQLite
 # syntax against the table 'data_tab', and to return only the bare SQL text.
 query_prompt = ChatPromptTemplate.from_template("""
 You are a SQL expert. Given a question about data in a table, write a SQLite-compatible SQL query to answer the question.
+CRITICAL RULES:
+1. ONLY use columns that are EXPLICITLY provided in the context. DO NOT invent or assume columns exist if they are not listed.
+2. If the user asks about a column that doesn't exist, use a similar column from the available ones or explain that the data doesn't contain that information.
+3. ALWAYS double-check that every column in your query is in the list of available columns.
+Technical guidelines:
+4. Use SQLite syntax (not PostgreSQL or MySQL)
+5. For date functions, use strftime() instead of EXTRACT
    - Example: strftime('%Y', date_column) instead of EXTRACT(YEAR FROM date_column)
+6. SQLite doesn't have TRUNCATE function, use CAST((column / bin_size) AS INT) * bin_size instead
+7. For percentiles, use window functions or approximate methods
+8. Keep queries efficient and focused on answering the specific question
+9. Always use 'data_tab' as the table name
+10. Return ONLY the SQL query without any markdown formatting, explanations, or code blocks
 Question: {question}
 """)
 # Define the prompt for interpreting the SQL query result.
 # The system message asks for a concise natural-language answer and tells the
 # model to point out when the query substituted a different column for the
 # one the user asked about. The human message fills in three placeholders:
 # {question}, {sql_query} and {data_summary}.
 interpret_prompt = ChatPromptTemplate.from_messages(
     [
+        ("system", """You are an experienced data analyst. Provide a concise, natural language answer based on the given data summary.
+If relevant, give key statistics, trends, or patterns. Be clear about what the data shows and doesn't show.
+If the SQL query had to use alternative columns because the exact ones requested weren't available, explain this clearly to the user.
+For example, if they asked about 'fare_amount' but the dataset has 'fare' or 'total_fare' instead, mention this substitution."""),
         ("human", "Question: {question}\nSQL Query: {sql_query}\nData Summary:\n{data_summary}")
     ]
 )
             # Connect to the database
             conn = sqlite3.connect(DB_PATH)
+            # Get schema information FIRST before doing anything else
             cursor = conn.cursor()
             cursor.execute(f"PRAGMA table_info({current_context['table_name']});")
             columns_info = cursor.fetchall()
             columns_with_types = [f"{col} ({typ})" for col, typ in zip(columns, column_types)]
             columns_str = ", ".join(columns_with_types)
+            # Handle specific queries directly based on schema
+            if "highest tip" in query.lower() or "largest tip" in query.lower() or "maximum tip" in query.lower():
+                # Look for tip-related columns
+                tip_columns = [col for col in columns if "tip" in col.lower() or "gratuity" in col.lower()]
+                if tip_columns:
+                    print(f"Found tip-related columns: {tip_columns}")
+                    sql_query = f"SELECT MAX({tip_columns[0]}) AS highest_tip FROM data_tab"
+                    # Execute the query directly
+                    result_df = pd.read_sql_query(sql_query, conn)
+                    # Generate response
+                    highest_tip = result_df.iloc[0, 0]
+                    response = f"The highest tip in the dataset is {highest_tip}."
+                    history[-1][1] = response
+                    return response, history
+                else:
+                    response = f"I couldn't find any columns related to tips in the dataset. Available columns are: {', '.join(columns)}"
+                    history[-1][1] = response
+                    return response, history
             # Create sample data context
             sample_query = "SELECT * FROM data_tab LIMIT 3;"
             sample_df = pd.read_sql_query(sample_query, conn)
             # Create question with detailed context
             question_with_context = f"""
+IMPORTANT: ONLY use the exact columns listed below. DO NOT use any columns not explicitly listed here.
+The table 'data_tab' has these columns with their types:
 {columns_str}
+Available columns (exact names): {', '.join(columns)}
 Here's a sample of the data:
 {sample_data}
 User question: {query}
+Remember to ONLY use the columns listed above. If the question seems to require a column that doesn't exist, use the most relevant existing column instead or explain that the data doesn't contain that information.
 """
             # Special handling for visualization types that need raw data