Spaces:

Vashishta-S-2141
/

LLM_Powered_Database_Chatbot

Sleeping

App Files Files Community

SVashishta1 committed on Mar 3, 2025

Commit

61ce4a6

1 Parent(s): d33fd46

Error Fix

Browse files

Files changed (1) hide show

app.py +67 -103

app.py CHANGED Viewed

@@ -35,64 +35,31 @@ DB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "csv_
 os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
 # Define the prompt with examples
-query_prompt = ChatPromptTemplate.from_messages(
-    [
-        ("system", """
-            You are an SQL and data analysis expert. Generate an appropriate SQL query using SQLite syntax for the question provided, without any explanations or code comments.
-            Follow SQLite-specific conventions, as shown in the examples below:
-            Example 1:
-            Question: "What is the average fare for trips over 10 miles?"
-            SQL Query: SELECT AVG(fare_amount) FROM taxi_data WHERE trip_distance > 10;
-            Example 2:
-            Question: "How many trips were taken in each month?"
-            SQL Query: SELECT strftime('%m', pickup_datetime) AS month, COUNT(*) AS trip_count FROM taxi_data GROUP BY month;
-            Example 3:
-            Question: "What is the total fare amount for each driver (medallion) per day?"
-            SQL Query: SELECT DATE(pickup_datetime) AS date, medallion, SUM(fare_amount) AS total_fare FROM taxi_data GROUP BY date, medallion;
-            SQLite-Specific Conventions:
-            1. Date and Time Extraction:
-               - Instead of `EXTRACT(YEAR FROM column)`, use `strftime('%Y', column)` to extract the year.
-               - Example: `SELECT strftime('%Y', pickup_datetime) FROM taxi_data;`
-            2. String Length:
-               - Instead of `CHAR_LENGTH(column)`, use `LENGTH(column)`.
-               - Example: `SELECT LENGTH(passenger_name) FROM taxi_data;`
-            3. Regular Expressions:
-               - SQLite does not support `REGEXP`. Use `LIKE` for simple patterns or avoid regular expressions.
-               - Example: `SELECT * FROM taxi_data WHERE passenger_name LIKE 'A%';`
-            4. Window Functions:
-               - For row numbering, use `ROW_NUMBER()` if supported, or simulate with joins.
-               - Example: `SELECT id, ROW_NUMBER() OVER (ORDER BY pickup_datetime) AS row_num FROM taxi_data;`
-            5. Data Type Casting:
-               - Use `CAST(column AS TYPE)`, but note that SQLite supports limited types.
-               - Example: `SELECT CAST(fare_amount AS INTEGER) FROM taxi_data;`
-            6. Full Outer Join Workaround:
-               - SQLite doesn't support `FULL OUTER JOIN`. Combine `LEFT JOIN` and `UNION` for a similar effect.
-               - Example:
-                 ```
-                 SELECT a.*, b.*
-                 FROM table_a a
-                 LEFT JOIN table_b b ON a.id = b.id
-                 UNION
-                 SELECT a.*, b.*
-                 FROM table_a a
-                 RIGHT JOIN table_b b ON a.id = b.id;
-                 ```
-            Use these examples and guidelines to generate an SQL query compatible with SQLite syntax for the question provided.
-        """),
-        ("human", "{question}"),
-    ]
-)
 # Define the prompt for interpreting the SQL query result
 interpret_prompt = ChatPromptTemplate.from_messages(
@@ -107,86 +74,83 @@ def process_text_query(query, history):
     if not query:
         return "", history
-    # More specific SQL detection - look for actual SQL-like patterns or explicit SQL requests
-    sql_keywords = ['select', 'from', 'where', 'group by', 'order by', 'having', 'join']
-    data_analysis_keywords = ['average', 'count', 'sum', 'maximum', 'minimum', 'mean', 'analyze', 'calculate']
-    # Check if this is explicitly about the CSV/database data
-    is_sql_query = (
-        any(keyword in query.lower() for keyword in sql_keywords) or
-        ('csv' in query.lower() and any(keyword in query.lower() for keyword in data_analysis_keywords)) or
-        'database' in query.lower() or
-        'table' in query.lower()
-    )
     try:
-        # Connect to the SQLite database to check if we have any tables
         conn = sqlite3.connect(DB_PATH)
         cursor = conn.cursor()
         cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
         tables = [row[0] for row in cursor.fetchall()]
-        conn.close()
-        if is_sql_query and tables:
-            try:
-                conn = sqlite3.connect(DB_PATH)
-                cursor = conn.cursor()
-                # Build context with table information
-                table_info = []
-                for table in tables:
-                    cursor.execute(f"PRAGMA table_info({table});")
-                    columns = [f"{col[1]} ({col[2]})" for col in cursor.fetchall()]
-                    table_info.append(f"Table '{table}' has columns: {', '.join(columns)}")
-                # Create question with context
-                question_with_context = f"The database contains the following tables:\n{chr(10).join(table_info)}\n\n{query}"
-                # Generate SQL query using the query engine
-                sql_query = query_engine.generate_response(query_prompt.format(question=question_with_context))
-                # Verify the response is actually a SQL query
-                if not any(keyword in sql_query.lower() for keyword in ['select', 'from']):
-                    raise ValueError("Generated response is not a valid SQL query")
                 try:
-                    # Execute the query
                     result_df = pd.read_sql_query(sql_query, conn)
-                    # Format the data for the interpretation
                     if len(result_df) > 10:
                         data_str = f"{result_df.head(10).to_string()}\n... (showing 10 of {len(result_df)} rows)"
                     else:
                         data_str = result_df.to_string()
-                    # Create the response
                     response = f"**SQL Query:**\n```sql\n{sql_query}\n```\n\n"
                     if not result_df.empty:
                         response += f"**Results:**\n```\n{data_str}\n```\n\n"
                     else:
-                        response += "**No results found.**\n\n"
                 except Exception as e:
-                    response = f"**SQL Query:**\n```sql\n{sql_query}\n```\n\n**Error executing query:** {str(e)}"
-                conn.close()
-            except Exception as e:
-                # If there's an error with SQL processing, fall back to document query
                 response = document_assistant.process_query(query)
         else:
-            # Process regular document query
             response = document_assistant.process_query(query)
     except Exception as e:
-        # If there's any database connection error, fall back to document query
         response = document_assistant.process_query(query)
-    # Update history with message format
     history.append({"role": "user", "content": query})
     history.append({"role": "assistant", "content": response})
     return "", history
 def process_file_upload(files):

 os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
 # Define the prompt with examples
+query_prompt = ChatPromptTemplate.from_messages([
+    ("system", """You are an SQL expert. Generate an appropriate SQL query using SQLite syntax for the question provided. The query should be executable and return exactly what was asked for.
+For questions about maximum/highest values, use MAX().
+For minimum/lowest values, use MIN().
+For averages, use AVG().
+For counts, use COUNT().
+For sums, use SUM().
+Examples:
+1. Question: "What is the highest tip_amount in the dataset?"
+   SQL: SELECT MAX(tip_amount) as highest_tip FROM data_tab;
+2. Question: "What is the average fare amount?"
+   SQL: SELECT AVG(fare_amount) as average_fare FROM data_tab;
+3. Question: "How many trips are there?"
+   SQL: SELECT COUNT(*) as trip_count FROM data_tab;
+4. Question: "What are the top 5 highest tip amounts?"
+   SQL: SELECT * FROM data_tab ORDER BY tip_amount DESC LIMIT 5;
+Generate only the SQL query, nothing else. Make sure to use the correct table name from the context provided."""),
+    ("human", "{question}")
+])
 # Define the prompt for interpreting the SQL query result
 interpret_prompt = ChatPromptTemplate.from_messages(
     if not query:
         return "", history
+    # First, check if we have any CSV data loaded
     try:
         conn = sqlite3.connect(DB_PATH)
         cursor = conn.cursor()
         cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
         tables = [row[0] for row in cursor.fetchall()]
+        if tables:
+            # Get table schema information
+            table_info = []
+            for table in tables:
+                cursor.execute(f"PRAGMA table_info({table});")
+                columns = [f"{col[1]} ({col[2]})" for col in cursor.fetchall()]
+                table_info.append(f"Table '{table}' has columns: {', '.join(columns)}")
+            # For questions about specific values, aggregations, or data analysis
+            if any(word in query.lower() for word in [
+                'what is', 'how many', 'highest', 'lowest', 'maximum', 'minimum',
+                'average', 'mean', 'sum', 'total', 'count', 'tip', 'fare', 'amount'
+            ]):
                 try:
+                    # Generate SQL query
+                    context = f"The database contains the following tables:\n{chr(10).join(table_info)}\n\nQuestion: {query}"
+                    sql_query = query_engine.generate_response(query_prompt.format(question=context))
+                    # Execute query
                     result_df = pd.read_sql_query(sql_query, conn)
+                    # Format results
                     if len(result_df) > 10:
                         data_str = f"{result_df.head(10).to_string()}\n... (showing 10 of {len(result_df)} rows)"
                     else:
                         data_str = result_df.to_string()
+                    # Generate response
                     response = f"**SQL Query:**\n```sql\n{sql_query}\n```\n\n"
                     if not result_df.empty:
                         response += f"**Results:**\n```\n{data_str}\n```\n\n"
+                        # Add interpretation
+                        interpret_prompt = f"""
+                        Question: {query}
+                        SQL Query: {sql_query}
+                        Results: {data_str}
+                        Please provide a clear, concise answer to the question based on these results.
+                        """
+                        interpretation = query_engine.generate_response(interpret_prompt)
+                        response += f"**Answer:**\n{interpretation}"
                     else:
+                        response += "No results found."
+                    history.append({"role": "user", "content": query})
+                    history.append({"role": "assistant", "content": response})
+                    return "", history
                 except Exception as e:
+                    print(f"SQL Error: {str(e)}")
+                    # Fall back to document query if SQL fails
+                    response = document_assistant.process_query(query)
+            else:
+                # For non-data analysis questions, use document query
                 response = document_assistant.process_query(query)
         else:
+            # No tables found, use document query
             response = document_assistant.process_query(query)
+        conn.close()
     except Exception as e:
+        print(f"Database Error: {str(e)}")
+        # Fall back to document query if database access fails
         response = document_assistant.process_query(query)
+    # Update history
     history.append({"role": "user", "content": query})
     history.append({"role": "assistant", "content": response})
     return "", history
 def process_file_upload(files):