Spaces:

Vashishta-S-2141
/

LLM_Powered_Database_Chatbot

Sleeping

App Files Files Community

SVashishta1 committed on Mar 10

Commit

b0db292

1 Parent(s): 028022d

Error Fix

Browse files

Files changed (1) hide show

app.py +134 -300

app.py CHANGED Viewed

@@ -299,311 +299,21 @@ def process_text_query(query, history):
                 # Add visualization if requested
                 if is_visualization and not result_df.empty:
                     try:
-                        print("Visualization requested, attempting to create plot...")
-                        # Set common figure parameters
-                        fig_width = 1000
-                        fig_height = 700
-                        # Create the appropriate visualization based on type
-                        if viz_type == 'pie' and len(result_df) <= 20:
-                            # For pie charts, we need a category column and a value column
-                            category_col = result_df.columns[0]
-                            value_col = numeric_cols[0] if numeric_cols else result_df.columns[1]
-                            # Handle case where all columns are numeric
-                            if len(numeric_cols) == len(result_df.columns):
-                                category_col = result_df.index.name or 'index'
-                                result_df = result_df.reset_index()
-                            fig = px.pie(
-                                result_df,
-                                names=category_col,
-                                values=value_col,
-                                title=f"Distribution of {value_col} by {category_col}",
-                                hole=0.3,  # Donut chart for better readability
-                                color_discrete_sequence=px.colors.qualitative.Pastel
-                            )
-                        elif viz_type == 'histogram' and len(result_df.columns) > 0:
-                            # For histograms, we need at least one column
-                            # Find the best column for histogram (prefer numeric)
-                            if numeric_cols:
-                                x_col = numeric_cols[0]
-                            else:
-                                x_col = result_df.columns[0]
-                            # Check if data is already binned
-                            if len(result_df) <= 30 and ('bin' in result_df.columns or 'range' in result_df.columns):
-                                # Data is pre-binned, use a bar chart
-                                bin_col = 'bin' if 'bin' in result_df.columns else 'range'
-                                count_col = 'count' if 'count' in result_df.columns else numeric_cols[0] if numeric_cols else result_df.columns[1]
-                                fig = px.bar(
-                                    result_df,
-                                    x=bin_col,
-                                    y=count_col,
-                                    title=f"Histogram of {x_col}",
-                                    labels={bin_col: x_col, count_col: 'Frequency'},
-                                    color_discrete_sequence=['#636EFA']
-                                )
-                            else:
-                                # Create a proper histogram from raw data
-                                fig = px.histogram(
-                                    result_df,
-                                    x=x_col,
-                                    title=f"Distribution of {x_col}",
-                                    nbins=20,
-                                    marginal="box",  # Add a box plot on the margin
-                                    color_discrete_sequence=['#636EFA'],
-                                    opacity=0.8
-                                )
-                            # Improve histogram layout
-                            fig.update_layout(
-                                bargap=0.1,  # Gap between bars
-                                xaxis_title=x_col,
-                                yaxis_title='Frequency',
-                                showlegend=True
-                            )
-                        elif viz_type == 'box' and numeric_cols:
-                            # For box plots, we need to handle the data differently
-                            # SQLite doesn't support window functions for percentiles
-                            # So we'll calculate the box plot statistics in Python
-                            # Get the numeric column to plot
-                            x_col = numeric_cols[0]
-                            # Create a box plot using plotly express
-                            fig = px.box(
-                                result_df,
-                                y=x_col,
-                                title=f"Box Plot of {x_col}",
-                                points="outliers",  # Only show outlier points
-                                color_discrete_sequence=['#636EFA']
-                            )
-                            # Add a strip plot (individual points) on the side for better visualization
-                            fig.add_trace(
-                                px.strip(result_df, y=x_col, color_discrete_sequence=['#FECB52']).data[0]
-                            )
-                        elif viz_type == 'heatmap' and len(numeric_cols) >= 2:
-                            # For heatmaps, we need at least 2 numeric columns
-                            # If we have many numeric columns, create a correlation matrix
-                            if len(numeric_cols) >= 3:
-                                # Create a correlation matrix
-                                # First, drop any rows with NaN values in numeric columns
-                                clean_df = result_df[numeric_cols].dropna()
-                                if len(clean_df) > 1:  # Need at least 2 rows for correlation
-                                    corr_df = clean_df.corr()
-                                    # Round to 2 decimal places for display
-                                    corr_df = corr_df.round(2)
-                                    fig = px.imshow(
-                                        corr_df,
-                                        title="Correlation Heatmap",
-                                        color_continuous_scale='RdBu_r',
-                                        text_auto=True,  # Show correlation values
-                                        aspect="auto",
-                                        zmin=-1, zmax=1  # Set limits for correlation values
-                                    )
-                                    # Improve heatmap layout
-                                    fig.update_layout(
-                                        xaxis_title="Features",
-                                        yaxis_title="Features",
-                                        coloraxis_colorbar=dict(
-                                            title="Correlation",
-                                            thicknessmode="pixels", thickness=20,
-                                            lenmode="pixels", len=300,
-                                            yanchor="top", y=1,
-                                            ticks="outside"
-                                        )
-                                    )
-                                else:
-                                    # Not enough data for correlation
-                                    fig = px.bar(
-                                        pd.DataFrame({'Message': ['Not enough data for heatmap']}),
-                                        title="Cannot create heatmap - insufficient data"
-                                    )
-                            else:
-                                # If we only have 2 numeric columns, create a 2D histogram
-                                x_col = numeric_cols[0]
-                                y_col = numeric_cols[1]
-                                # Create a 2D histogram (heatmap)
-                                fig = px.density_heatmap(
-                                    result_df,
-                                    x=x_col,
-                                    y=y_col,
-                                    title=f"Density Heatmap of {x_col} vs {y_col}",
-                                    color_continuous_scale='Viridis',
-                                    nbinsx=20,
-                                    nbinsy=20,
-                                    marginal_x="histogram",  # Add histograms on the margins
-                                    marginal_y="histogram"
-                                )
-                                # Improve heatmap layout
-                                fig.update_layout(
-                                    xaxis_title=x_col,
-                                    yaxis_title=y_col,
-                                    coloraxis_colorbar=dict(
-                                        title="Count",
-                                        thicknessmode="pixels", thickness=20,
-                                        lenmode="pixels", len=300,
-                                        yanchor="top", y=1,
-                                        ticks="outside"
-                                    )
-                                )
-                        elif viz_type == 'scatter' and len(numeric_cols) >= 2:
-                            # For scatter plots, we need at least 2 numeric columns
-                            x_col = numeric_cols[0]
-                            y_col = numeric_cols[1]
-                            # Add a third dimension (size) if available
-                            size_col = numeric_cols[2] if len(numeric_cols) > 2 else None
-                            # Add a color dimension if available
-                            if len(result_df.columns) > len(numeric_cols):
-                                # Find a categorical column for color
-                                categorical_cols = [col for col in result_df.columns if col not in numeric_cols]
-                                color_col = categorical_cols[0] if categorical_cols else None
-                            else:
-                                color_col = None
-                            # Create scatter plot with enhanced features
-                            fig = px.scatter(
-                                result_df,
-                                x=x_col,
-                                y=y_col,
-                                size=size_col,
-                                color=color_col,  # Add color dimension if available
-                                title=f"Relationship between {x_col} and {y_col}",
-                                opacity=0.7,
-                                size_max=15,  # Maximum marker size
-                                color_discrete_sequence=px.colors.qualitative.Plotly
-                            )
-                            # Add a trend line
-                            if pd.api.types.is_numeric_dtype(result_df[x_col]) and pd.api.types.is_numeric_dtype(result_df[y_col]):
-                                fig.update_layout(
-                                    shapes=[
-                                        dict(
-                                            type='line',
-                                            xref='x', yref='y',
-                                            x0=result_df[x_col].min(),
-                                            y0=result_df[y_col].min(),
-                                            x1=result_df[x_col].max(),
-                                            y1=result_df[y_col].max(),
-                                            line=dict(color='red', width=2, dash='dash')
-                                        )
-                                    ]
-                                )
-                            # Improve scatter plot layout
-                            fig.update_layout(
-                                xaxis_title=x_col,
-                                yaxis_title=y_col,
-                                showlegend=True,
-                                legend=dict(
-                                    title=color_col if color_col else "",
-                                    orientation="h",
-                                    yanchor="bottom",
-                                    y=1.02,
-                                    xanchor="right",
-                                    x=1
-                                )
-                            )
-                        elif viz_type == 'line':
-                            # For line charts, determine the x-axis (preferably a date/time column)
-                            time_cols = [col for col in result_df.columns if any(time_word in col.lower()
-                                                                    for time_word in ['date', 'time', 'month', 'year', 'day'])]
-                            if time_cols:
-                                x_col = time_cols[0]
-                            else:
-                                x_col = result_df.columns[0]
-                            # Determine y-axis columns (numeric columns)
-                            y_cols = numeric_cols[:3]  # Use up to 3 numeric columns
-                            if not y_cols and len(result_df.columns) > 1:
-                                # If no numeric columns, use the second column
-                                y_cols = [result_df.columns[1]]
-                            fig = px.line(
-                                result_df,
-                                x=x_col,
-                                y=y_cols,
-                                title="Time Series Analysis",
-                                markers=True,  # Add markers at each data point
-                                color_discrete_sequence=px.colors.qualitative.Plotly
-                            )
-                            # Add range slider for time series
-                            fig.update_layout(
-                                xaxis=dict(
-                                    rangeslider=dict(visible=True),
-                                    type='category' if not pd.api.types.is_datetime64_any_dtype(result_df[x_col]) else '-'
-                                )
-                            )
-                        else:  # Default to bar chart
-                            # For bar charts, use the first column as x and numeric columns as y
-                            x_col = result_df.columns[0]
-                            # Determine y-axis columns (numeric columns)
-                            if numeric_cols and x_col not in numeric_cols:
-                                y_cols = numeric_cols[:3]  # Use up to 3 numeric columns
-                            elif len(result_df.columns) > 1:
-                                y_cols = [result_df.columns[1]]
-                            else:
-                                y_cols = ['value']
-                                result_df['value'] = 1  # Default value if no suitable column
-                            fig = px.bar(
-                                result_df,
-                                x=x_col,
-                                y=y_cols[0],  # Use only the first y column for bar charts
-                                title="Data Visualization",
-                                color_discrete_sequence=['#636EFA']
-                            )
-                        # Improve figure layout for all chart types
-                        fig.update_layout(
-                            autosize=True,
-                            width=fig_width,
-                            height=fig_height,
-                            margin=dict(l=50, r=50, b=100, t=100, pad=4),
-                            template="plotly_white",
-                            font=dict(size=14),
-                            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
-                            plot_bgcolor='rgba(240,240,240,0.2)',  # Light gray background
-                            paper_bgcolor='white'
-                        )
-                        # Convert the figure to an image and encode it as base64
-                        img_bytes = fig.to_image(format="png", width=fig_width, height=fig_height, scale=2)
-                        encoded = base64.b64encode(img_bytes).decode("ascii")
-                        img_src = f"data:image/png;base64,{encoded}"
-                        # Add the image directly to the response with increased size
-                        response += f"\n\n<img src='{img_src}' width='100%' style='min-height:700px;' />"
-                        # Add note about visualization
-                        response += f"\n\n**A {viz_type} visualization has been generated and is displayed above.**"
                     except Exception as viz_error:
                         print(f"Visualization error: {str(viz_error)}")
                         traceback.print_exc()
             except Exception as e:
@@ -910,6 +620,130 @@ except NameError as e:
         importlib.reload(backend.vector_db)
         from backend.vector_db import ChromaVectorDB
 # Create Gradio interface
 with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
     gr.Markdown("# 🤖 AI Document Analysis & Voice Assistant")

                 # Add visualization if requested
                 if is_visualization and not result_df.empty:
                     try:
+                        # Generate visualization
+                        viz_html = generate_visualization(result_df, query)
+                        if viz_html:
+                            # Add the visualization to the response
+                            response += f"\n\n{viz_html}"
+                            # Add note about visualization
+                            response += "\n\n**A visualization has been generated and is displayed above.**"
+                        else:
+                            response += "\n\n**Could not generate visualization due to an error.**"
                     except Exception as viz_error:
                         print(f"Visualization error: {str(viz_error)}")
+                        import traceback
                         traceback.print_exc()
             except Exception as e:
         importlib.reload(backend.vector_db)
         from backend.vector_db import ChromaVectorDB
+# Add this function to app.py
+def generate_visualization(result_df, query):
+    """Generate a visualization based on the query and data"""
+    try:
+        print("Visualization requested, attempting to create plot...")
+        # Set common figure parameters
+        fig_width = 1000
+        fig_height = 700
+        # Determine visualization type from query
+        viz_type = 'bar'  # Default
+        if any(word in query.lower() for word in ['pie', 'distribution', 'proportion']):
+            viz_type = 'pie'
+        elif any(word in query.lower() for word in ['line', 'trend', 'time series']):
+            viz_type = 'line'
+        elif any(word in query.lower() for word in ['scatter', 'relationship']):
+            viz_type = 'scatter'
+        elif any(word in query.lower() for word in ['histogram', 'distribution of']):
+            viz_type = 'histogram'
+        elif any(word in query.lower() for word in ['box', 'boxplot', 'outliers']):
+            viz_type = 'box'
+        elif any(word in query.lower() for word in ['heatmap', 'correlation']):
+            viz_type = 'heatmap'
+        print(f"Creating {viz_type} visualization...")
+        # Find numeric columns
+        numeric_cols = result_df.select_dtypes(include=['number']).columns.tolist()
+        # Create basic visualization based on type
+        if viz_type == 'pie' and len(result_df) <= 20:
+            # Simple pie chart
+            labels = result_df.iloc[:, 0].tolist()
+            values = result_df.iloc[:, 1].tolist() if len(result_df.columns) > 1 else [1] * len(result_df)
+            import plotly.graph_objects as go
+            fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
+            fig.update_layout(title_text='Pie Chart')
+        elif viz_type == 'histogram' and len(numeric_cols) > 0:
+            # Simple histogram
+            import plotly.express as px
+            fig = px.histogram(result_df, x=numeric_cols[0])
+            fig.update_layout(title_text=f'Histogram of {numeric_cols[0]}')
+        elif viz_type == 'box' and len(numeric_cols) > 0:
+            # Simple box plot
+            import plotly.express as px
+            fig = px.box(result_df, y=numeric_cols[0])
+            fig.update_layout(title_text=f'Box Plot of {numeric_cols[0]}')
+        elif viz_type == 'heatmap' and len(numeric_cols) >= 2:
+            # Simple heatmap
+            import plotly.express as px
+            # Create correlation matrix
+            corr_df = result_df[numeric_cols].corr()
+            fig = px.imshow(corr_df, text_auto=True)
+            fig.update_layout(title_text='Correlation Heatmap')
+        elif viz_type == 'scatter' and len(numeric_cols) >= 2:
+            # Simple scatter plot
+            import plotly.express as px
+            fig = px.scatter(result_df, x=numeric_cols[0], y=numeric_cols[1])
+            fig.update_layout(title_text=f'Scatter Plot of {numeric_cols[0]} vs {numeric_cols[1]}')
+        elif viz_type == 'line':
+            # Simple line chart
+            import plotly.express as px
+            x_col = result_df.columns[0]
+            y_cols = numeric_cols if numeric_cols else [result_df.columns[1]] if len(result_df.columns) > 1 else None
+            if y_cols:
+                fig = px.line(result_df, x=x_col, y=y_cols[0])
+                fig.update_layout(title_text=f'Line Chart of {y_cols[0]} over {x_col}')
+            else:
+                # Fallback to bar chart
+                viz_type = 'bar'
+        if viz_type == 'bar' or 'fig' not in locals():
+            # Simple bar chart (default)
+            import plotly.express as px
+            x_col = result_df.columns[0]
+            y_col = numeric_cols[0] if numeric_cols else result_df.columns[1] if len(result_df.columns) > 1 else None
+            if y_col:
+                fig = px.bar(result_df, x=x_col, y=y_col)
+                fig.update_layout(title_text=f'Bar Chart of {y_col} by {x_col}')
+            else:
+                fig = px.bar(result_df, x=x_col)
+                fig.update_layout(title_text=f'Bar Chart of {x_col}')
+        # Set common layout properties
+        fig.update_layout(
+            width=fig_width,
+            height=fig_height,
+            template="plotly_white"
+        )
+        print(f"Created figure with width={fig_width}, height={fig_height}")
+        # Convert to image
+        print("Converting figure to image...")
+        import plotly.io as pio
+        img_bytes = pio.to_image(fig, format="png", width=fig_width, height=fig_height, scale=2)
+        print("Image conversion successful")
+        # Encode as base64
+        import base64
+        encoded = base64.b64encode(img_bytes).decode("ascii")
+        img_src = f"data:image/png;base64,{encoded}"
+        print("HTML conversion successful")
+        # Return the HTML img tag
+        return f"<img src='{img_src}' width='100%' style='min-height:700px;' />"
+    except Exception as e:
+        import traceback
+        print(f"Error generating visualization: {str(e)}")
+        traceback.print_exc()
+        return None
 # Create Gradio interface
 with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
     gr.Markdown("# 🤖 AI Document Analysis & Voice Assistant")