Spaces:

Vashishta-S-2141
/

LLM_Powered_Database_Chatbot

Sleeping

App Files Files Community

SVashishta1 committed on Apr 24, 2025

Commit

77df513

1 Parent(s): e770679

Fix: Improve SQL query generation with better column checking and error handling

Browse files

Files changed (1) hide show

app.py +124 -12

app.py CHANGED Viewed

@@ -69,14 +69,16 @@ query_prompt = ChatPromptTemplate.from_template("""
 You are a SQL expert. Given a question about data in a table, write a SQLite-compatible SQL query to answer the question.
 Important guidelines:
-1. Use SQLite syntax (not PostgreSQL or MySQL)
-2. For date functions, use strftime() instead of EXTRACT
    - Example: strftime('%Y', date_column) instead of EXTRACT(YEAR FROM date_column)
-3. SQLite doesn't have TRUNCATE function, use CAST((column / bin_size) AS INT) * bin_size instead
-4. For percentiles, use window functions or approximate methods
-5. Keep queries efficient and focused on answering the specific question
-6. Always use 'data_tab' as the table name
-7. IMPORTANT: Return ONLY the SQL query without any markdown formatting, explanations, or code blocks
 Question: {question}
 """)
@@ -210,11 +212,29 @@ def process_text_query(query, history):
             # Get column information for context
             cursor = conn.cursor()
             cursor.execute(f"PRAGMA table_info({current_context['table_name']});")
-            columns = [info[1] for info in cursor.fetchall()]
-            columns_str = ", ".join(columns)
-            # Create question with context
-            question_with_context = f"The table 'data_tab' has columns: {columns_str}. {query}"
             # Special handling for visualization types that need raw data
             if is_visualization and viz_type in ['box', 'heatmap']:
@@ -270,8 +290,100 @@ def process_text_query(query, history):
                 sql_query = llm.invoke(query_prompt.format(question=question_with_context)).content
                 sql_query = clean_sql_query(sql_query)
             # Execute the query
-            result_df = pd.read_sql_query(sql_query, conn)
             # Close the connection
             conn.close()

 You are a SQL expert. Given a question about data in a table, write a SQLite-compatible SQL query to answer the question.
 Important guidelines:
+1. MOST IMPORTANT: Only use columns that are explicitly provided in the context. Do not assume or invent columns.
+2. Use SQLite syntax (not PostgreSQL or MySQL)
+3. For date functions, use strftime() instead of EXTRACT
    - Example: strftime('%Y', date_column) instead of EXTRACT(YEAR FROM date_column)
+4. SQLite doesn't have TRUNCATE function, use CAST((column / bin_size) AS INT) * bin_size instead
+5. For percentiles, use window functions or approximate methods
+6. Keep queries efficient and focused on answering the specific question
+7. Always use 'data_tab' as the table name
+8. IMPORTANT: Return ONLY the SQL query without any markdown formatting, explanations, or code blocks
+9. If the question seems to require a column that isn't provided, use the most relevant existing column instead
 Question: {question}
 """)
             # Get column information for context
             cursor = conn.cursor()
             cursor.execute(f"PRAGMA table_info({current_context['table_name']});")
+            columns_info = cursor.fetchall()
+            columns = [info[1] for info in columns_info]
+            column_types = [info[2] for info in columns_info]
+            # Create rich context with column types
+            columns_with_types = [f"{col} ({typ})" for col, typ in zip(columns, column_types)]
+            columns_str = ", ".join(columns_with_types)
+            # Create sample data context
+            sample_query = "SELECT * FROM data_tab LIMIT 3;"
+            sample_df = pd.read_sql_query(sample_query, conn)
+            sample_data = sample_df.to_string(index=False, max_rows=3)
+            # Create question with detailed context
+            question_with_context = f"""
+The table 'data_tab' has the following columns with their types:
+{columns_str}
+Here's a sample of the data:
+{sample_data}
+User question: {query}
+"""
             # Special handling for visualization types that need raw data
             if is_visualization and viz_type in ['box', 'heatmap']:
                 sql_query = llm.invoke(query_prompt.format(question=question_with_context)).content
                 sql_query = clean_sql_query(sql_query)
+            # Check if all columns in the query exist before executing
+            try:
+                # Get all column names
+                cursor.execute("PRAGMA table_info(data_tab);")
+                available_columns = [info[1] for info in cursor.fetchall()]
+                # Extract column names from the SQL query (simple approach)
+                query_columns = []
+                from_pos = sql_query.lower().find("from")
+                if from_pos > 0:
+                    select_part = sql_query[:from_pos].lower()
+                    # Remove SELECT keyword
+                    if select_part.startswith("select "):
+                        select_part = select_part[7:]
+                    # Split by commas and extract column names
+                    for col_expr in select_part.split(","):
+                        col_expr = col_expr.strip()
+                        # Handle AS aliases and functions
+                        if " as " in col_expr:
+                            col_expr = col_expr.split(" as ")[0].strip()
+                        # Extract column name from functions
+                        for func in ["max(", "min(", "avg(", "sum(", "count("]:
+                            if func in col_expr:
+                                # Extract column inside function
+                                start_idx = col_expr.find(func) + len(func)
+                                end_idx = col_expr.find(")", start_idx)
+                                if end_idx > start_idx:
+                                    col_name = col_expr[start_idx:end_idx].strip()
+                                    if col_name != "*" and "(" not in col_name:  # Skip nested functions and *
+                                        query_columns.append(col_name)
+                        # Handle direct column references
+                        if "(" not in col_expr and col_expr != "*":
+                            query_columns.append(col_expr)
+                # Check for missing columns
+                missing_columns = []
+                for col in query_columns:
+                    if col not in available_columns and col.strip() != "*":
+                        missing_columns.append(col)
+                if missing_columns:
+                    # Generate a simpler query with available columns
+                    if "tip" in query.lower() or "gratuity" in query.lower():
+                        # Look for a tip column
+                        tip_columns = [col for col in available_columns if "tip" in col.lower() or "gratuity" in col.lower()]
+                        if tip_columns:
+                            sql_query = f"SELECT MAX({tip_columns[0]}) AS highest_tip FROM data_tab"
+                        else:
+                            # No tip column, return info about available columns
+                            return f"I couldn't find a column related to tips or gratuity. Available columns are: {', '.join(available_columns)}", history
+                    else:
+                        # For other queries, suggest a generic query
+                        return f"Some columns in the query don't exist in the current dataset: {', '.join(missing_columns)}. Available columns are: {', '.join(available_columns)}", history
+            except Exception as e:
+                print(f"Error checking columns: {str(e)}")
+                # Continue with the original query
             # Execute the query
+            try:
+                result_df = pd.read_sql_query(sql_query, conn)
+            except Exception as e:
+                error_message = str(e)
+                # Try to provide a more helpful error message
+                if "no such column" in error_message.lower():
+                    # Extract column name from error
+                    column_name = error_message.split("no such column: ")[-1].strip("'").strip('"')
+                    # Look for similar columns
+                    cursor.execute("PRAGMA table_info(data_tab);")
+                    available_columns = [info[1] for info in cursor.fetchall()]
+                    # Simple fuzzy matching
+                    similar_columns = []
+                    for col in available_columns:
+                        # Check if column name contains parts of the error column
+                        if column_name.lower() in col.lower() or any(part.lower() in col.lower() for part in column_name.split('_') if len(part) > 2):
+                            similar_columns.append(col)
+                    if similar_columns:
+                        message = f"Column '{column_name}' doesn't exist in the current dataset. Did you mean one of these? {', '.join(similar_columns)}\n\nAvailable columns are: {', '.join(available_columns)}"
+                    else:
+                        message = f"Column '{column_name}' doesn't exist in the current dataset. Available columns are: {', '.join(available_columns)}"
+                    history[-1][1] = message
+                    return message, history
+                else:
+                    # Generic error message
+                    error_msg = f"Error executing query: {error_message}"
+                    history[-1][1] = error_msg
+                    return error_msg, history
             # Close the connection
             conn.close()