Spaces:

Vashishta-S-2141
/

LLM_Powered_Database_Chatbot

Sleeping

App Files Files Community

SVashishta1 committed on Mar 10

Commit

e3d98a2

1 Parent(s): a8c9793

Error Fix

Browse files

Files changed (1) hide show

app.py +113 -75

app.py CHANGED Viewed

@@ -54,73 +54,22 @@ current_context = {
 current_plot = None
 # Define the prompt with examples for SQL query generation
-query_prompt = ChatPromptTemplate.from_messages(
-    [
-        ("system", """
-            You are an SQL and data analysis expert. Generate an appropriate SQL query using SQLite syntax for the question provided, without any explanations or code comments.
-            Follow SQLite-specific conventions, as shown in the examples below:
-            Example 1:
-            Question: "What is the average fare for trips over 10 miles?"
-            SQL Query: SELECT AVG(fare_amount) FROM data_tab WHERE trip_distance > 10;
-            Example 2:
-            Question: "How many trips were taken in each month?"
-            SQL Query: SELECT strftime('%m', pickup_datetime) AS month, COUNT(*) AS trip_count FROM data_tab GROUP BY month;
-            Example 3:
-            Question: "What is the total fare amount for each driver (medallion) per day?"
-            SQL Query: SELECT DATE(pickup_datetime) AS date, medallion, SUM(fare_amount) AS total_fare FROM data_tab GROUP BY date, medallion;
-            Example 4:
-            Question: "What is the highest tip amount in the dataset?"
-            SQL Query: SELECT MAX(tip_amount) as highest_tip FROM data_tab;
-            Example 5:
-            Question: "Plot a bar graph for tip trends by month"
-            SQL Query: SELECT strftime('%Y-%m', pickup_datetime) as month, AVG(tip_amount) as avg_tip, COUNT(*) as count FROM data_tab GROUP BY month ORDER BY month;
-            SQLite-Specific Conventions:
-            1. Date and Time Extraction:
-               - Instead of `EXTRACT(YEAR FROM column)`, use `strftime('%Y', column)` to extract the year.
-               - Example: `SELECT strftime('%Y', pickup_datetime) FROM data_tab;`
-            2. String Length:
-               - Instead of `CHAR_LENGTH(column)`, use `LENGTH(column)`.
-               - Example: `SELECT LENGTH(passenger_name) FROM data_tab;`
-            3. Regular Expressions:
-               - SQLite does not support `REGEXP`. Use `LIKE` for simple patterns or avoid regular expressions.
-               - Example: `SELECT * FROM data_tab WHERE passenger_name LIKE 'A%';`
-            4. Window Functions:
-               - For row numbering, use `ROW_NUMBER()` if supported, or simulate with joins.
-               - Example: `SELECT id, ROW_NUMBER() OVER (ORDER BY pickup_datetime) AS row_num FROM data_tab;`
-            5. Data Type Casting:
-               - Use `CAST(column AS TYPE)`, but note that SQLite supports limited types.
-               - Example: `SELECT CAST(fare_amount AS INTEGER) FROM data_tab;`
-            6. Full Outer Join Workaround:
-               - SQLite doesn't support `FULL OUTER JOIN`. Combine `LEFT JOIN` and `UNION` for a similar effect.
-               - Example:
-                 ```
-                 SELECT a.*, b.*
-                 FROM table_a a
-                 LEFT JOIN table_b b ON a.id = b.id
-                 UNION
-                 SELECT a.*, b.*
-                 FROM table_a a
-                 RIGHT JOIN table_b b ON a.id = b.id;
-                 ```
-            Use these examples and guidelines to generate an SQL query compatible with SQLite syntax for the question provided.
-            Always use 'data_tab' as the table name.
-        """),
-        ("human", "{question}"),
-    ]
-)
 # Define the prompt for interpreting the SQL query result
 interpret_prompt = ChatPromptTemplate.from_messages(
@@ -130,6 +79,37 @@ interpret_prompt = ChatPromptTemplate.from_messages(
     ]
 )
 def process_text_query(query, history):
     """Process a text query and update chat history"""
     if not query:
@@ -205,28 +185,86 @@ def process_text_query(query, history):
                 if is_visualization and not result_df.empty:
                     try:
                         print("Visualization requested, attempting to create plot...")
-                        # Determine the type of visualization based on the data
                         if len(result_df.columns) >= 2:
                             # Find numeric columns for y-axis
                             numeric_cols = result_df.select_dtypes(include=['number']).columns.tolist()
                             if len(numeric_cols) >= 1 and len(result_df) > 1:
-                                # Use the first column as x and first numeric column as y
-                                x_col = result_df.columns[0]
-                                y_cols = numeric_cols[:3]  # Use up to 3 numeric columns
-                                print(f"Creating plot with x={x_col}, y={y_cols}")
-                                # Create appropriate plot based on data characteristics
-                                if 'month' in result_df.columns or 'date' in result_df.columns or 'year' in result_df.columns or any('date' in col.lower() for col in result_df.columns):
                                     # Time series data - use line chart
-                                    fig = px.line(result_df, x=x_col, y=y_cols, title="Time Series Analysis")
                                 else:
                                     # Regular data - use bar chart
-                                    fig = px.bar(result_df, x=x_col, y=y_cols[0], title="Data Visualization")
                                 # Convert the figure to an image and encode it as base64
-                                img_bytes = fig.to_image(format="png", width=800, height=500)
                                 encoded = base64.b64encode(img_bytes).decode("ascii")
                                 img_src = f"data:image/png;base64,{encoded}"

 current_plot = None
 # Define the prompt with examples for SQL query generation
+query_prompt = ChatPromptTemplate.from_template("""
+You are a SQL expert. Given a question about data in a table, write a SQLite-compatible SQL query to answer the question.
+Important guidelines:
+1. Use SQLite syntax (not PostgreSQL or MySQL)
+2. For date functions, use strftime() instead of EXTRACT
+   - Example: strftime('%Y', date_column) instead of EXTRACT(YEAR FROM date_column)
+3. SQLite doesn't have TRUNCATE function, use CAST((column / bin_size) AS INT) * bin_size instead
+4. For percentiles, use window functions or approximate methods
+5. Keep queries efficient and focused on answering the specific question
+6. Always use 'data_tab' as the table name
+Question: {question}
+SQL Query:
+""")
 # Define the prompt for interpreting the SQL query result
 interpret_prompt = ChatPromptTemplate.from_messages(
     ]
 )
+# Define the prompt for generating SQLite queries that retrieve data for a requested visualization
+visualization_prompt = ChatPromptTemplate.from_template("""
+You are a data visualization expert. Given a question about visualizing data, write a SQLite-compatible SQL query that will retrieve the appropriate data for the visualization.
+Important guidelines for SQLite syntax:
+1. Use strftime() for date functions:
+   - Year: strftime('%Y', date_column)
+   - Month: strftime('%m', date_column)
+   - Day: strftime('%d', date_column)
+   - Hour: strftime('%H', date_column)
+2. For histograms and binning:
+   - Use: CAST((column / bin_size) AS INT) * bin_size
+   - Example: CAST((trip_distance / 0.5) AS INT) * 0.5 AS distance_bin
+3. For percentiles and statistics:
+   - SQLite doesn't have built-in percentile functions
+   - Use simple aggregations (MIN, MAX, AVG, COUNT) instead
+4. For time series:
+   - Group by date parts using strftime()
+   - Example: strftime('%Y-%m-%d', pickup_datetime) AS day
+5. Always use 'data_tab' as the table name
+Question: {question}
+Visualization type: {viz_type}
+SQL Query:
+""")
 def process_text_query(query, history):
     """Process a text query and update chat history"""
     if not query:
                 if is_visualization and not result_df.empty:
                     try:
                         print("Visualization requested, attempting to create plot...")
+                        # Determine the type of visualization based on the data and query
+                        # Check for specific visualization types in the query
+                        is_pie_chart = any(word in query.lower() for word in ['pie chart', 'pie graph', 'distribution'])
+                        is_histogram = any(word in query.lower() for word in ['histogram', 'distribution of', 'frequency'])
+                        is_heatmap = any(word in query.lower() for word in ['heatmap', 'heat map', 'correlation'])
+                        is_scatter = any(word in query.lower() for word in ['scatter', 'relationship between', 'correlation'])
                         if len(result_df.columns) >= 2:
                             # Find numeric columns for y-axis
                             numeric_cols = result_df.select_dtypes(include=['number']).columns.tolist()
                             if len(numeric_cols) >= 1 and len(result_df) > 1:
+                                # Create appropriate plot based on query and data characteristics
+                                if is_pie_chart and len(result_df) <= 20:  # Pie charts work best with limited categories
+                                    # For pie charts, we need a category column and a value column
+                                    category_col = result_df.columns[0]
+                                    value_col = numeric_cols[0] if len(numeric_cols) > 0 else result_df.columns[1]
+                                    fig = px.pie(result_df, names=category_col, values=value_col,
+                                                title="Distribution Analysis",
+                                                hole=0.3)  # Use a donut chart for better readability
+                                elif is_histogram and len(numeric_cols) > 0:
+                                    # For histograms, we need a numeric column
+                                    fig = px.histogram(result_df, x=numeric_cols[0],
+                                                    title=f"Distribution of {numeric_cols[0]}",
+                                                    nbins=20)
+                                elif is_heatmap and len(numeric_cols) >= 2:
+                                    # For heatmaps, we need at least 2 numeric columns
+                                    # Convert to a correlation matrix if needed
+                                    if len(result_df.columns) == len(numeric_cols) and len(numeric_cols) > 2:
+                                        # This is likely already a correlation matrix or similar data
+                                        fig = px.imshow(result_df,
+                                                    title="Correlation Heatmap",
+                                                    color_continuous_scale='RdBu_r',
+                                                    aspect="auto")
+                                    else:
+                                        # Create a correlation matrix from the numeric columns
+                                        corr_df = result_df[numeric_cols].corr()
+                                        fig = px.imshow(corr_df,
+                                                    title="Correlation Heatmap",
+                                                    color_continuous_scale='RdBu_r',
+                                                    aspect="auto")
+                                elif is_scatter and len(numeric_cols) >= 2:
+                                    # For scatter plots, we need at least 2 numeric columns
+                                    fig = px.scatter(result_df, x=numeric_cols[0], y=numeric_cols[1],
+                                                    title=f"Relationship between {numeric_cols[0]} and {numeric_cols[1]}",
+                                                    opacity=0.7)
+                                elif 'month' in result_df.columns or 'date' in result_df.columns or 'year' in result_df.columns or any('date' in col.lower() for col in result_df.columns):
                                     # Time series data - use line chart
+                                    x_col = result_df.columns[0]
+                                    y_cols = numeric_cols[:3]  # Use up to 3 numeric columns
+                                    fig = px.line(result_df, x=x_col, y=y_cols,
+                                                title="Time Series Analysis",
+                                                markers=True)
                                 else:
                                     # Regular data - use bar chart
+                                    x_col = result_df.columns[0]
+                                    y_cols = numeric_cols[0]
+                                    fig = px.bar(result_df, x=x_col, y=y_cols,
+                                                title="Data Visualization")
+                                # Improve figure layout
+                                fig.update_layout(
+                                    autosize=True,
+                                    width=900,
+                                    height=600,
+                                    margin=dict(l=50, r=50, b=100, t=100, pad=4),
+                                    template="plotly_white",
+                                    font=dict(size=14)
+                                )
                                 # Convert the figure to an image and encode it as base64
+                                img_bytes = fig.to_image(format="png", width=900, height=600, scale=2)
                                 encoded = base64.b64encode(img_bytes).decode("ascii")
                                 img_src = f"data:image/png;base64,{encoded}"