Spaces:

Vashishta-S-2141
/

LLM_Powered_Database_Chatbot

Sleeping

App Files Community

SVashishta1 committed on Mar 10, 2025

Commit

984ec75

1 Parent(s): 2736104

Error Fix

Browse files

Files changed (1) hide show

app.py +246 -294

app.py CHANGED Viewed

@@ -300,156 +300,109 @@ def process_text_query(query, history):
                     try:
                         print("Visualization requested, attempting to create plot...")
-                        # Increase plot size
-                        fig_width = 1000  # Increased from 900
-                        fig_height = 700  # Increased from 600
-                        # Determine visualization type from query
-                        viz_type = None
-                        for vtype, keywords in viz_keywords.items():
-                            if any(keyword in query.lower() for keyword in keywords):
-                                viz_type = vtype
-                                break
-                        # If no specific type is detected, infer from data
-                        if not viz_type:
-                            if len(result_df) <= 10 and len(result_df.columns) == 2:
-                                viz_type = 'pie'  # Small dataset with 2 columns is good for pie charts
-                            elif any('date' in col.lower() or 'time' in col.lower() or 'month' in col.lower() or 'year' in col.lower() for col in result_df.columns):
-                                viz_type = 'line'  # Time-related data is good for line charts
-                            else:
-                                viz_type = 'bar'  # Default to bar chart
-                        print(f"Detected visualization type: {viz_type}")
-                        # Find numeric columns for visualization
-                        numeric_cols = result_df.select_dtypes(include=['number']).columns.tolist()
                         # Create the appropriate visualization based on type
-                        if len(numeric_cols) >= 1 and len(result_df) > 1:
-                            if viz_type == 'pie' and len(result_df) <= 20:
-                                # For pie charts, we need a category column and a value column
-                                category_col = result_df.columns[0]
-                                value_col = numeric_cols[0] if numeric_cols else result_df.columns[1]
-                                # Handle case where all columns are numeric
-                                if len(numeric_cols) == len(result_df.columns):
-                                    category_col = result_df.index.name or 'index'
-                                    result_df = result_df.reset_index()
-                                fig = px.pie(
-                                    result_df,
-                                    names=category_col,
-                                    values=value_col,
-                                    title=f"Distribution of {value_col} by {category_col}",
-                                    hole=0.3,  # Donut chart for better readability
-                                    color_discrete_sequence=px.colors.qualitative.Pastel
-                                )
-                            elif viz_type == 'histogram' and len(result_df.columns) > 0:
-                                # For histograms, we need at least one column
-                                # Find the best column for histogram (prefer numeric)
-                                if numeric_cols:
-                                    x_col = numeric_cols[0]
-                                else:
-                                    x_col = result_df.columns[0]
-                                # Check if data is already binned
-                                if len(result_df) <= 30 and ('bin' in result_df.columns or 'range' in result_df.columns):
-                                    # Data is pre-binned, use a bar chart
-                                    bin_col = 'bin' if 'bin' in result_df.columns else 'range'
-                                    count_col = 'count' if 'count' in result_df.columns else numeric_cols[0] if numeric_cols else result_df.columns[1]
-                                    fig = px.bar(
-                                        result_df,
-                                        x=bin_col,
-                                        y=count_col,
-                                        title=f"Histogram of {x_col}",
-                                        labels={bin_col: x_col, count_col: 'Frequency'},
-                                        color_discrete_sequence=['#636EFA']
-                                    )
-                                else:
-                                    # Create a proper histogram from raw data
-                                    fig = px.histogram(
-                                        result_df,
-                                        x=x_col,
-                                        title=f"Distribution of {x_col}",
-                                        nbins=20,
-                                        marginal="box",  # Add a box plot on the margin
-                                        color_discrete_sequence=['#636EFA'],
-                                        opacity=0.8,
-                                        histnorm='probability density'  # Normalize to show density instead of count
-                                    )
-                                    # Add a KDE (kernel density estimate) curve
-                                    from scipy import stats
-                                    import numpy as np
-                                    # Only add KDE if we have numeric data
-                                    if pd.api.types.is_numeric_dtype(result_df[x_col]):
-                                        # Remove NaN values
-                                        data = result_df[x_col].dropna()
-                                        if len(data) > 1:  # Need at least 2 points for KDE
-                                            # Calculate KDE
-                                            kde = stats.gaussian_kde(data)
-                                            x_range = np.linspace(data.min(), data.max(), 1000)
-                                            y_kde = kde(x_range)
-                                            # Add KDE curve
-                                            fig.add_scatter(
-                                                x=x_range,
-                                                y=y_kde,
-                                                mode='lines',
-                                                line=dict(color='red', width=2),
-                                                name='Density Curve'
-                                            )
-                                # Improve histogram layout
-                                fig.update_layout(
-                                    bargap=0.1,  # Gap between bars
-                                    xaxis_title=x_col,
-                                    yaxis_title='Frequency',
-                                    showlegend=True
-                                )
-                            elif viz_type == 'box' and numeric_cols:
-                                # For box plots, we need to handle the data differently
-                                # SQLite doesn't support window functions for percentiles
-                                # So we'll calculate the box plot statistics in Python
-                                # Get the numeric column to plot
                                 x_col = numeric_cols[0]
-                                # Create a box plot using plotly express
-                                fig = px.box(
                                     result_df,
-                                    y=x_col,
-                                    title=f"Box Plot of {x_col}",
-                                    points="outliers",  # Only show outlier points
                                     color_discrete_sequence=['#636EFA']
                                 )
-                                # Add a strip plot (individual points) on the side for better visualization
-                                fig.add_trace(
-                                    px.strip(result_df, y=x_col, color_discrete_sequence=['#FECB52']).data[0]
                                 )
-                            elif viz_type == 'heatmap' and len(numeric_cols) >= 2:
-                                # For heatmaps, we need at least 2 numeric columns
-                                # If we have many numeric columns, create a correlation matrix
-                                if len(numeric_cols) >= 3:
-                                    # Create a correlation matrix
-                                    # First, drop any rows with NaN values in numeric columns
-                                    clean_df = result_df[numeric_cols].dropna()
-                                    if len(clean_df) > 1:  # Need at least 2 rows for correlation
-                                        corr_df = clean_df.corr()
-                                        # Round to 2 decimal places for display
-                                        corr_df = corr_df.round(2)
                                     fig = px.imshow(
                                         corr_df,
@@ -459,196 +412,195 @@ def process_text_query(query, history):
                                         aspect="auto",
                                         zmin=-1, zmax=1  # Set limits for correlation values
                                     )
-                                        # Improve heatmap layout
-                                fig.update_layout(
-                                            xaxis_title="Features",
-                                            yaxis_title="Features",
-                                            coloraxis_colorbar=dict(
-                                                title="Correlation",
-                                                thicknessmode="pixels", thickness=20,
-                                                lenmode="pixels", len=300,
-                                                yanchor="top", y=1,
-                                                ticks="outside"
-                                            )
-                                        )
-                                    else:
-                                        # Not enough data for correlation
-                                        fig = px.bar(
-                                            pd.DataFrame({'Message': ['Not enough data for heatmap']}),
-                                            title="Cannot create heatmap - insufficient data"
-                                        )
-                                else:
-                                    # If we only have 2 numeric columns, create a 2D histogram
-                                    x_col = numeric_cols[0]
-                                    y_col = numeric_cols[1]
-                                    # Create a 2D histogram (heatmap)
-                                    fig = px.density_heatmap(
-                                        result_df,
-                                        x=x_col,
-                                        y=y_col,
-                                        title=f"Density Heatmap of {x_col} vs {y_col}",
-                                        color_continuous_scale='Viridis',
-                                        nbinsx=20,
-                                        nbinsy=20,
-                                        marginal_x="histogram",  # Add histograms on the margins
-                                        marginal_y="histogram"
-                                    )
                                     # Improve heatmap layout
                                     fig.update_layout(
-                                        xaxis_title=x_col,
-                                        yaxis_title=y_col,
                                         coloraxis_colorbar=dict(
-                                            title="Count",
                                             thicknessmode="pixels", thickness=20,
                                             lenmode="pixels", len=300,
                                             yanchor="top", y=1,
                                             ticks="outside"
                                         )
                                     )
-                            elif viz_type == 'scatter' and len(numeric_cols) >= 2:
-                                # For scatter plots, we need at least 2 numeric columns
                                 x_col = numeric_cols[0]
                                 y_col = numeric_cols[1]
-                                # Add a third dimension (size) if available
-                                size_col = numeric_cols[2] if len(numeric_cols) > 2 else None
-                                # Add a color dimension if available
-                                if len(result_df.columns) > len(numeric_cols):
-                                    # Find a categorical column for color
-                                    categorical_cols = [col for col in result_df.columns if col not in numeric_cols]
-                                    color_col = categorical_cols[0] if categorical_cols else None
-                                else:
-                                    color_col = None
-                                # Create scatter plot with enhanced features
-                                fig = px.scatter(
                                     result_df,
                                     x=x_col,
                                     y=y_col,
-                                    size=size_col,
-                                    color=color_col,  # Add color dimension if available
-                                    title=f"Relationship between {x_col} and {y_col}",
-                                    opacity=0.7,
-                                    size_max=15,  # Maximum marker size
-                                    color_discrete_sequence=px.colors.qualitative.Plotly
                                 )
-                                # Add a trend line
-                                if pd.api.types.is_numeric_dtype(result_df[x_col]) and pd.api.types.is_numeric_dtype(result_df[y_col]):
-                                    fig.update_layout(
-                                        shapes=[
-                                            dict(
-                                                type='line',
-                                                xref='x', yref='y',
-                                                x0=result_df[x_col].min(),
-                                                y0=result_df[y_col].min(),
-                                                x1=result_df[x_col].max(),
-                                                y1=result_df[y_col].max(),
-                                                line=dict(color='red', width=2, dash='dash')
-                                            )
-                                        ]
-                                    )
-                                # Improve scatter plot layout
                                 fig.update_layout(
                                     xaxis_title=x_col,
                                     yaxis_title=y_col,
-                                    showlegend=True,
-                                    legend=dict(
-                                        title=color_col if color_col else "",
-                                        orientation="h",
-                                        yanchor="bottom",
-                                        y=1.02,
-                                        xanchor="right",
-                                        x=1
                                     )
                                 )
-                            elif viz_type == 'line':
-                                # For line charts, determine the x-axis (preferably a date/time column)
-                                time_cols = [col for col in result_df.columns if any(time_word in col.lower()
-                                                                        for time_word in ['date', 'time', 'month', 'year', 'day'])]
-                                if time_cols:
-                                    x_col = time_cols[0]
-                                else:
-                                    x_col = result_df.columns[0]
-                                # Determine y-axis columns (numeric columns)
-                                y_cols = numeric_cols[:3]  # Use up to 3 numeric columns
-                                if not y_cols and len(result_df.columns) > 1:
-                                    # If no numeric columns, use the second column
-                                    y_cols = [result_df.columns[1]]
-                                fig = px.line(
-                                    result_df,
-                                    x=x_col,
-                                    y=y_cols,
-                                    title="Time Series Analysis",
-                                    markers=True,  # Add markers at each data point
-                                    color_discrete_sequence=px.colors.qualitative.Plotly
-                                )
-                                # Add range slider for time series
                                 fig.update_layout(
-                                    xaxis=dict(
-                                        rangeslider=dict(visible=True),
-                                        type='category' if not pd.api.types.is_datetime64_any_dtype(result_df[x_col]) else '-'
-                                    )
                                 )
-                            else:  # Default to bar chart
-                                # For bar charts, use the first column as x and numeric columns as y
-                                x_col = result_df.columns[0]
-                                # Determine y-axis columns (numeric columns)
-                                if numeric_cols and x_col not in numeric_cols:
-                                    y_cols = numeric_cols[:3]  # Use up to 3 numeric columns
-                                elif len(result_df.columns) > 1:
-                                    y_cols = [result_df.columns[1]]
-                                else:
-                                    y_cols = ['value']
-                                    result_df['value'] = 1  # Default value if no suitable column
-                                fig = px.bar(
-                                    result_df,
-                                    x=x_col,
-                                    y=y_cols[0],  # Use only the first y column for bar charts
-                                    title="Data Visualization",
-                                    color_discrete_sequence=['#636EFA']
                                 )
-                            # Improve figure layout for all chart types
                             fig.update_layout(
-                                autosize=True,
-                                width=fig_width,
-                                height=fig_height,
-                                margin=dict(l=50, r=50, b=100, t=100, pad=4),
-                                template="plotly_white",
-                                font=dict(size=14),
-                                legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
-                                plot_bgcolor='rgba(240,240,240,0.2)',  # Light gray background
-                                paper_bgcolor='white'
                             )
-                            # Convert the figure to an image and encode it as base64
-                            img_bytes = fig.to_image(format="png", width=fig_width, height=fig_height, scale=2)
-                            encoded = base64.b64encode(img_bytes).decode("ascii")
-                            img_src = f"data:image/png;base64,{encoded}"
-                            # Add the image directly to the response with increased size
-                            response += f"\n\n<img src='{img_src}' width='100%' style='min-height:700px;' />"
-                            # Add note about visualization
-                            response += f"\n\n**A {viz_type} visualization has been generated and is displayed above.**"
-                        else:
-                            print("Not enough numeric columns or data points for visualization")
                     except Exception as viz_error:
                         print(f"Visualization error: {str(viz_error)}")
                         traceback.print_exc()

                     try:
                         print("Visualization requested, attempting to create plot...")
+                        # Set common figure parameters
+                        fig_width = 1000
+                        fig_height = 700
                         # Create the appropriate visualization based on type
+                        if viz_type == 'pie' and len(result_df) <= 20:
+                            # For pie charts, we need a category column and a value column
+                            category_col = result_df.columns[0]
+                            value_col = numeric_cols[0] if numeric_cols else result_df.columns[1]
+                            # Handle case where all columns are numeric
+                            if len(numeric_cols) == len(result_df.columns):
+                                category_col = result_df.index.name or 'index'
+                                result_df = result_df.reset_index()
+                            fig = px.pie(
+                                result_df,
+                                names=category_col,
+                                values=value_col,
+                                title=f"Distribution of {value_col} by {category_col}",
+                                hole=0.3,  # Donut chart for better readability
+                                color_discrete_sequence=px.colors.qualitative.Pastel
+                            )
+                        elif viz_type == 'histogram' and len(result_df.columns) > 0:
+                            # For histograms, we need at least one column
+                            # Find the best column for histogram (prefer numeric)
+                            if numeric_cols:
                                 x_col = numeric_cols[0]
+                            else:
+                                x_col = result_df.columns[0]
+                            # Check if data is already binned
+                            if len(result_df) <= 30 and ('bin' in result_df.columns or 'range' in result_df.columns):
+                                # Data is pre-binned, use a bar chart
+                                bin_col = 'bin' if 'bin' in result_df.columns else 'range'
+                                count_col = 'count' if 'count' in result_df.columns else numeric_cols[0] if numeric_cols else result_df.columns[1]
+                                fig = px.bar(
                                     result_df,
+                                    x=bin_col,
+                                    y=count_col,
+                                    title=f"Histogram of {x_col}",
+                                    labels={bin_col: x_col, count_col: 'Frequency'},
                                     color_discrete_sequence=['#636EFA']
                                 )
+                            else:
+                                # Create a proper histogram from raw data
+                                fig = px.histogram(
+                                    result_df,
+                                    x=x_col,
+                                    title=f"Distribution of {x_col}",
+                                    nbins=20,
+                                    marginal="box",  # Add a box plot on the margin
+                                    color_discrete_sequence=['#636EFA'],
+                                    opacity=0.8
                                 )
+                            # Improve histogram layout
+                            fig.update_layout(
+                                bargap=0.1,  # Gap between bars
+                                xaxis_title=x_col,
+                                yaxis_title='Frequency',
+                                showlegend=True
+                            )
+                        elif viz_type == 'box' and numeric_cols:
+                            # For box plots, we need to handle the data differently
+                            # SQLite doesn't support window functions for percentiles
+                            # So we'll calculate the box plot statistics in Python
+                            # Get the numeric column to plot
+                            x_col = numeric_cols[0]
+                            # Create a box plot using plotly express
+                            fig = px.box(
+                                result_df,
+                                y=x_col,
+                                title=f"Box Plot of {x_col}",
+                                points="outliers",  # Only show outlier points
+                                color_discrete_sequence=['#636EFA']
+                            )
+                            # Add a strip plot (individual points) on the side for better visualization
+                            fig.add_trace(
+                                px.strip(result_df, y=x_col, color_discrete_sequence=['#FECB52']).data[0]
+                            )
+                        elif viz_type == 'heatmap' and len(numeric_cols) >= 2:
+                            # For heatmaps, we need at least 2 numeric columns
+                            # If we have many numeric columns, create a correlation matrix
+                            if len(numeric_cols) >= 3:
+                                # Create a correlation matrix
+                                # First, drop any rows with NaN values in numeric columns
+                                clean_df = result_df[numeric_cols].dropna()
+                                if len(clean_df) > 1:  # Need at least 2 rows for correlation
+                                    corr_df = clean_df.corr()
+                                    # Round to 2 decimal places for display
+                                    corr_df = corr_df.round(2)
                                     fig = px.imshow(
                                         corr_df,
                                         aspect="auto",
                                         zmin=-1, zmax=1  # Set limits for correlation values
                                     )
                                     # Improve heatmap layout
                                     fig.update_layout(
+                                        xaxis_title="Features",
+                                        yaxis_title="Features",
                                         coloraxis_colorbar=dict(
+                                            title="Correlation",
                                             thicknessmode="pixels", thickness=20,
                                             lenmode="pixels", len=300,
                                             yanchor="top", y=1,
                                             ticks="outside"
                                         )
                                     )
+                                else:
+                                    # Not enough data for correlation
+                                    fig = px.bar(
+                                        pd.DataFrame({'Message': ['Not enough data for heatmap']}),
+                                        title="Cannot create heatmap - insufficient data"
+                                    )
+                            else:
+                                # If we only have 2 numeric columns, create a 2D histogram
                                 x_col = numeric_cols[0]
                                 y_col = numeric_cols[1]
+                                # Create a 2D histogram (heatmap)
+                                fig = px.density_heatmap(
                                     result_df,
                                     x=x_col,
                                     y=y_col,
+                                    title=f"Density Heatmap of {x_col} vs {y_col}",
+                                    color_continuous_scale='Viridis',
+                                    nbinsx=20,
+                                    nbinsy=20,
+                                    marginal_x="histogram",  # Add histograms on the margins
+                                    marginal_y="histogram"
                                 )
+                                # Improve heatmap layout
                                 fig.update_layout(
                                     xaxis_title=x_col,
                                     yaxis_title=y_col,
+                                    coloraxis_colorbar=dict(
+                                        title="Count",
+                                        thicknessmode="pixels", thickness=20,
+                                        lenmode="pixels", len=300,
+                                        yanchor="top", y=1,
+                                        ticks="outside"
                                     )
                                 )
+                        elif viz_type == 'scatter' and len(numeric_cols) >= 2:
+                            # For scatter plots, we need at least 2 numeric columns
+                            x_col = numeric_cols[0]
+                            y_col = numeric_cols[1]
+                            # Add a third dimension (size) if available
+                            size_col = numeric_cols[2] if len(numeric_cols) > 2 else None
+                            # Add a color dimension if available
+                            if len(result_df.columns) > len(numeric_cols):
+                                # Find a categorical column for color
+                                categorical_cols = [col for col in result_df.columns if col not in numeric_cols]
+                                color_col = categorical_cols[0] if categorical_cols else None
+                            else:
+                                color_col = None
+                            # Create scatter plot with enhanced features
+                            fig = px.scatter(
+                                result_df,
+                                x=x_col,
+                                y=y_col,
+                                size=size_col,
+                                color=color_col,  # Add color dimension if available
+                                title=f"Relationship between {x_col} and {y_col}",
+                                opacity=0.7,
+                                size_max=15,  # Maximum marker size
+                                color_discrete_sequence=px.colors.qualitative.Plotly
+                            )
+                            # Add a trend line
+                            if pd.api.types.is_numeric_dtype(result_df[x_col]) and pd.api.types.is_numeric_dtype(result_df[y_col]):
                                 fig.update_layout(
+                                    shapes=[
+                                        dict(
+                                            type='line',
+                                            xref='x', yref='y',
+                                            x0=result_df[x_col].min(),
+                                            y0=result_df[y_col].min(),
+                                            x1=result_df[x_col].max(),
+                                            y1=result_df[y_col].max(),
+                                            line=dict(color='red', width=2, dash='dash')
+                                        )
+                                    ]
                                 )
+                            # Improve scatter plot layout
+                            fig.update_layout(
+                                xaxis_title=x_col,
+                                yaxis_title=y_col,
+                                showlegend=True,
+                                legend=dict(
+                                    title=color_col if color_col else "",
+                                    orientation="h",
+                                    yanchor="bottom",
+                                    y=1.02,
+                                    xanchor="right",
+                                    x=1
                                 )
+                            )
+                        elif viz_type == 'line':
+                            # For line charts, determine the x-axis (preferably a date/time column)
+                            time_cols = [col for col in result_df.columns if any(time_word in col.lower()
+                                                                    for time_word in ['date', 'time', 'month', 'year', 'day'])]
+                            if time_cols:
+                                x_col = time_cols[0]
+                            else:
+                                x_col = result_df.columns[0]
+                            # Determine y-axis columns (numeric columns)
+                            y_cols = numeric_cols[:3]  # Use up to 3 numeric columns
+                            if not y_cols and len(result_df.columns) > 1:
+                                # If no numeric columns, use the second column
+                                y_cols = [result_df.columns[1]]
+                            fig = px.line(
+                                result_df,
+                                x=x_col,
+                                y=y_cols,
+                                title="Time Series Analysis",
+                                markers=True,  # Add markers at each data point
+                                color_discrete_sequence=px.colors.qualitative.Plotly
+                            )
+                            # Add range slider for time series
                             fig.update_layout(
+                                xaxis=dict(
+                                    rangeslider=dict(visible=True),
+                                    type='category' if not pd.api.types.is_datetime64_any_dtype(result_df[x_col]) else '-'
+                                )
                             )
+                        else:  # Default to bar chart
+                            # For bar charts, use the first column as x and numeric columns as y
+                            x_col = result_df.columns[0]
+                            # Determine y-axis columns (numeric columns)
+                            if numeric_cols and x_col not in numeric_cols:
+                                y_cols = numeric_cols[:3]  # Use up to 3 numeric columns
+                            elif len(result_df.columns) > 1:
+                                y_cols = [result_df.columns[1]]
+                            else:
+                                y_cols = ['value']
+                                result_df['value'] = 1  # Default value if no suitable column
+                            fig = px.bar(
+                                result_df,
+                                x=x_col,
+                                y=y_cols[0],  # Use only the first y column for bar charts
+                                title="Data Visualization",
+                                color_discrete_sequence=['#636EFA']
+                            )
+                        # Improve figure layout for all chart types
+                        fig.update_layout(
+                            autosize=True,
+                            width=fig_width,
+                            height=fig_height,
+                            margin=dict(l=50, r=50, b=100, t=100, pad=4),
+                            template="plotly_white",
+                            font=dict(size=14),
+                            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
+                            plot_bgcolor='rgba(240,240,240,0.2)',  # Light gray background
+                            paper_bgcolor='white'
+                        )
+                        # Convert the figure to an image and encode it as base64
+                        img_bytes = fig.to_image(format="png", width=fig_width, height=fig_height, scale=2)
+                        encoded = base64.b64encode(img_bytes).decode("ascii")
+                        img_src = f"data:image/png;base64,{encoded}"
+                        # Add the image directly to the response with increased size
+                        response += f"\n\n<img src='{img_src}' width='100%' style='min-height:700px;' />"
+                        # Add note about visualization
+                        response += f"\n\n**A {viz_type} visualization has been generated and is displayed above.**"
                     except Exception as viz_error:
                         print(f"Visualization error: {str(viz_error)}")
                         traceback.print_exc()