Spaces:

Vashishta-S-2141
/

LLM_Powered_Database_Chatbot

Sleeping

App Files Community

SVashishta1 committed on Apr 24, 2025

Commit

61a8a66

1 Parent(s): 2f13356

Cleanup: Remove unused voice library comments

Browse files

Files changed (1) hide show

app.py +48 -203

app.py CHANGED Viewed

@@ -62,76 +62,65 @@ current_context = {
 }
 # Add a global variable to store the current plot
-current_plot = None
 # Define the prompt with examples for SQL query generation
 query_prompt = ChatPromptTemplate.from_template("""
 You are a SQL expert. Given a question about data in a table, write a SQLite-compatible SQL query to answer the question.
-CRITICAL RULES:
-1. ONLY use columns that are EXPLICITLY provided in the context. DO NOT invent or assume columns exist if they are not listed.
-2. If the user asks about a column that doesn't exist, use a similar column from the available ones or explain that the data doesn't contain that information.
-3. ALWAYS double-check that every column in your query is in the list of available columns.
-Technical guidelines:
-4. Use SQLite syntax (not PostgreSQL or MySQL)
-5. For date functions, use strftime() instead of EXTRACT
    - Example: strftime('%Y', date_column) instead of EXTRACT(YEAR FROM date_column)
-6. SQLite doesn't have TRUNCATE function, use CAST((column / bin_size) AS INT) * bin_size instead
-7. For percentiles, use window functions or approximate methods
-8. Keep queries efficient and focused on answering the specific question
-9. Always use 'data_tab' as the table name
-10. Return ONLY the SQL query without any markdown formatting, explanations, or code blocks
 Question: {question}
 """)
-# Add this after the query_prompt definition
-visualization_prompt = ChatPromptTemplate.from_template("""
-You are a data visualization expert. Given a question about visualizing data, write a SQLite-compatible SQL query that will retrieve the appropriate data for the visualization.
-Important guidelines for SQLite syntax:
-1. Use strftime() for date functions:
-   - Year: strftime('%Y', date_column)
-   - Month: strftime('%m', date_column)
-   - Day: strftime('%d', date_column)
-   - Hour: strftime('%H', date_column)
-2. For histograms and binning:
-   - Use: CAST((column / bin_size) AS INT) * bin_size
-   - Example: CAST((trip_distance / 0.5) AS INT) * 0.5 AS distance_bin
-3. For box plots:
-   - SQLite doesn't support PERCENTILE_CONT or window functions
-   - Simply return the raw data column: SELECT column_name FROM data_tab
-   - The application will calculate quartiles and outliers
-4. For heatmaps:
-   - Return raw data for correlation analysis
-   - Example: SELECT numeric_col1, numeric_col2, numeric_col3 FROM data_tab
-5. Always use 'data_tab' as the table name
-6. IMPORTANT: Return ONLY the SQL query without any markdown formatting, explanations, or code blocks
-Question: {question}
-Visualization type: {viz_type}
-""")
 # Define the prompt for interpreting the SQL query result
 interpret_prompt = ChatPromptTemplate.from_messages(
     [
-        ("system", """You are an experienced data analyst. Provide a concise, natural language answer based on the given data summary.
-If relevant, give key statistics, trends, or patterns. Be clear about what the data shows and doesn't show.
-If the SQL query had to use alternative columns because the exact ones requested weren't available, explain this clearly to the user.
-For example, if they asked about 'fare_amount' but the dataset has 'fare' or 'total_fare' instead, mention this substitution."""),
         ("human", "Question: {question}\nSQL Query: {sql_query}\nData Summary:\n{data_summary}")
     ]
 )
 # Add this helper function to clean SQL queries
 def clean_sql_query(query_text):
     """Clean SQL query text by removing markdown formatting and comments"""
@@ -218,59 +207,14 @@ def process_text_query(query, history):
             # Connect to the database
             conn = sqlite3.connect(DB_PATH)
-            # Get schema information FIRST before doing anything else
             cursor = conn.cursor()
             cursor.execute(f"PRAGMA table_info({current_context['table_name']});")
-            columns_info = cursor.fetchall()
-            columns = [info[1] for info in columns_info]
-            column_types = [info[2] for info in columns_info]
-            # Create rich context with column types
-            columns_with_types = [f"{col} ({typ})" for col, typ in zip(columns, column_types)]
-            columns_str = ", ".join(columns_with_types)
-            # Handle specific queries directly based on schema
-            if "highest tip" in query.lower() or "largest tip" in query.lower() or "maximum tip" in query.lower():
-                # Look for tip-related columns
-                tip_columns = [col for col in columns if "tip" in col.lower() or "gratuity" in col.lower()]
-                if tip_columns:
-                    print(f"Found tip-related columns: {tip_columns}")
-                    sql_query = f"SELECT MAX({tip_columns[0]}) AS highest_tip FROM data_tab"
-                    # Execute the query directly
-                    result_df = pd.read_sql_query(sql_query, conn)
-                    # Generate response
-                    highest_tip = result_df.iloc[0, 0]
-                    response = f"The highest tip in the dataset is {highest_tip}."
-                    history[-1][1] = response
-                    return response, history
-                else:
-                    response = f"I couldn't find any columns related to tips in the dataset. Available columns are: {', '.join(columns)}"
-                    history[-1][1] = response
-                    return response, history
-            # Create sample data context
-            sample_query = "SELECT * FROM data_tab LIMIT 3;"
-            sample_df = pd.read_sql_query(sample_query, conn)
-            sample_data = sample_df.to_string(index=False, max_rows=3)
-            # Create question with detailed context
-            question_with_context = f"""
-IMPORTANT: ONLY use the exact columns listed below. DO NOT use any columns not explicitly listed here.
-The table 'data_tab' has these columns with their types:
-{columns_str}
-Available columns (exact names): {', '.join(columns)}
-Here's a sample of the data:
-{sample_data}
-User question: {query}
-Remember to ONLY use the columns listed above. If the question seems to require a column that doesn't exist, use the most relevant existing column instead or explain that the data doesn't contain that information.
-"""
             # Special handling for visualization types that need raw data
             if is_visualization and viz_type in ['box', 'heatmap']:
@@ -314,112 +258,13 @@ Remember to ONLY use the columns listed above. If the question seems to require
                         sql_query = f"SELECT {cols_str} FROM data_tab WHERE {numeric_cols[0]} IS NOT NULL LIMIT 1000;"
                     else:
                         sql_query = "SELECT * FROM data_tab LIMIT 10;"
-            elif is_visualization:
-                # For visualization queries, use the specialized visualization prompt
-                sql_query = llm.invoke(visualization_prompt.format(
-                    question=question_with_context,
-                    viz_type=viz_type or "bar"
-                )).content
-                sql_query = clean_sql_query(sql_query)
             else:
                 # For other queries, use the LLM to generate SQL
                 sql_query = llm.invoke(query_prompt.format(question=question_with_context)).content
                 sql_query = clean_sql_query(sql_query)
-            # Check if all columns in the query exist before executing
-            try:
-                # Get all column names
-                cursor.execute("PRAGMA table_info(data_tab);")
-                available_columns = [info[1] for info in cursor.fetchall()]
-                # Extract column names from the SQL query (simple approach)
-                query_columns = []
-                from_pos = sql_query.lower().find("from")
-                if from_pos > 0:
-                    select_part = sql_query[:from_pos].lower()
-                    # Remove SELECT keyword
-                    if select_part.startswith("select "):
-                        select_part = select_part[7:]
-                    # Split by commas and extract column names
-                    for col_expr in select_part.split(","):
-                        col_expr = col_expr.strip()
-                        # Handle AS aliases and functions
-                        if " as " in col_expr:
-                            col_expr = col_expr.split(" as ")[0].strip()
-                        # Extract column name from functions
-                        for func in ["max(", "min(", "avg(", "sum(", "count("]:
-                            if func in col_expr:
-                                # Extract column inside function
-                                start_idx = col_expr.find(func) + len(func)
-                                end_idx = col_expr.find(")", start_idx)
-                                if end_idx > start_idx:
-                                    col_name = col_expr[start_idx:end_idx].strip()
-                                    if col_name != "*" and "(" not in col_name:  # Skip nested functions and *
-                                        query_columns.append(col_name)
-                        # Handle direct column references
-                        if "(" not in col_expr and col_expr != "*":
-                            query_columns.append(col_expr)
-                # Check for missing columns
-                missing_columns = []
-                for col in query_columns:
-                    if col not in available_columns and col.strip() != "*":
-                        missing_columns.append(col)
-                if missing_columns:
-                    # Generate a simpler query with available columns
-                    if "tip" in query.lower() or "gratuity" in query.lower():
-                        # Look for a tip column
-                        tip_columns = [col for col in available_columns if "tip" in col.lower() or "gratuity" in col.lower()]
-                        if tip_columns:
-                            sql_query = f"SELECT MAX({tip_columns[0]}) AS highest_tip FROM data_tab"
-                        else:
-                            # No tip column, return info about available columns
-                            return f"I couldn't find a column related to tips or gratuity. Available columns are: {', '.join(available_columns)}", history
-                    else:
-                        # For other queries, suggest a generic query
-                        return f"Some columns in the query don't exist in the current dataset: {', '.join(missing_columns)}. Available columns are: {', '.join(available_columns)}", history
-            except Exception as e:
-                print(f"Error checking columns: {str(e)}")
-                # Continue with the original query
             # Execute the query
-            try:
-                result_df = pd.read_sql_query(sql_query, conn)
-            except Exception as e:
-                error_message = str(e)
-                # Try to provide a more helpful error message
-                if "no such column" in error_message.lower():
-                    # Extract column name from error
-                    column_name = error_message.split("no such column: ")[-1].strip("'").strip('"')
-                    # Look for similar columns
-                    cursor.execute("PRAGMA table_info(data_tab);")
-                    available_columns = [info[1] for info in cursor.fetchall()]
-                    # Simple fuzzy matching
-                    similar_columns = []
-                    for col in available_columns:
-                        # Check if column name contains parts of the error column
-                        if column_name.lower() in col.lower() or any(part.lower() in col.lower() for part in column_name.split('_') if len(part) > 2):
-                            similar_columns.append(col)
-                    if similar_columns:
-                        message = f"Column '{column_name}' doesn't exist in the current dataset. Did you mean one of these? {', '.join(similar_columns)}\n\nAvailable columns are: {', '.join(available_columns)}"
-                    else:
-                        message = f"Column '{column_name}' doesn't exist in the current dataset. Available columns are: {', '.join(available_columns)}"
-                    history[-1][1] = message
-                    return message, history
-                else:
-                    # Generic error message
-                    error_msg = f"Error executing query: {error_message}"
-                    history[-1][1] = error_msg
-                    return error_msg, history
             # Close the connection
             conn.close()

 }
 # Add a global variable to store the current plot
+# current_plot = None
 # Define the prompt with examples for SQL query generation
 query_prompt = ChatPromptTemplate.from_template("""
 You are a SQL expert. Given a question about data in a table, write a SQLite-compatible SQL query to answer the question.
+Important guidelines:
+1. Use SQLite syntax (not PostgreSQL or MySQL)
+2. For date functions, use strftime() instead of EXTRACT
    - Example: strftime('%Y', date_column) instead of EXTRACT(YEAR FROM date_column)
+3. SQLite doesn't have TRUNCATE function, use CAST((column / bin_size) AS INT) * bin_size instead
+4. For percentiles, use window functions or approximate methods
+5. Keep queries efficient and focused on answering the specific question
+6. Always use 'data_tab' as the table name
+7. IMPORTANT: Return ONLY the SQL query without any markdown formatting, explanations, or code blocks
 Question: {question}
 """)
 # Define the prompt for interpreting the SQL query result
 interpret_prompt = ChatPromptTemplate.from_messages(
     [
+        ("system", "You are an experienced data analyst. Provide a concise, natural language answer based on the given data summary. If relevant, give key statistics, trends, or patterns."),
         ("human", "Question: {question}\nSQL Query: {sql_query}\nData Summary:\n{data_summary}")
     ]
 )
+# Add this after the query_prompt definition
+# visualization_prompt = ChatPromptTemplate.from_template("""
+# You are a data visualization expert. Given a question about visualizing data, write a SQLite-compatible SQL query that will retrieve the appropriate data for the visualization.
+#
+# Important guidelines for SQLite syntax:
+# 1. Use strftime() for date functions:
+#    - Year: strftime('%Y', date_column)
+#    - Month: strftime('%m', date_column)
+#    - Day: strftime('%d', date_column)
+#    - Hour: strftime('%H', date_column)
+#
+# 2. For histograms and binning:
+#    - Use: CAST((column / bin_size) AS INT) * bin_size
+#    - Example: CAST((trip_distance / 0.5) AS INT) * 0.5 AS distance_bin
+#
+# 3. For box plots:
+#    - SQLite doesn't support PERCENTILE_CONT or window functions
+#    - Simply return the raw data column: SELECT column_name FROM data_tab
+#    - The application will calculate quartiles and outliers
+#
+# 4. For heatmaps:
+#    - Return raw data for correlation analysis
+#    - Example: SELECT numeric_col1, numeric_col2, numeric_col3 FROM data_tab
+#
+# 5. Always use 'data_tab' as the table name
+#
+# 6. IMPORTANT: Return ONLY the SQL query without any markdown formatting, explanations, or code blocks
+#
+# Question: {question}
+# Visualization type: {viz_type}
+# """)
 # Add this helper function to clean SQL queries
 def clean_sql_query(query_text):
     """Clean SQL query text by removing markdown formatting and comments"""
             # Connect to the database
             conn = sqlite3.connect(DB_PATH)
+            # Get column information for context
             cursor = conn.cursor()
             cursor.execute(f"PRAGMA table_info({current_context['table_name']});")
+            columns = [info[1] for info in cursor.fetchall()]
+            columns_str = ", ".join(columns)
+            # Create question with context
+            question_with_context = f"The table 'data_tab' has columns: {columns_str}. {query}"
             # Special handling for visualization types that need raw data
             if is_visualization and viz_type in ['box', 'heatmap']:
                         sql_query = f"SELECT {cols_str} FROM data_tab WHERE {numeric_cols[0]} IS NOT NULL LIMIT 1000;"
                     else:
                         sql_query = "SELECT * FROM data_tab LIMIT 10;"
             else:
                 # For other queries, use the LLM to generate SQL
                 sql_query = llm.invoke(query_prompt.format(question=question_with_context)).content
                 sql_query = clean_sql_query(sql_query)
             # Execute the query
+            result_df = pd.read_sql_query(sql_query, conn)
             # Close the connection
             conn.close()