Spaces:

Vashishta-S-2141
/

LLM_Powered_Database_Chatbot

Sleeping

App Files Files Community

SVashishta1 committed on Mar 3

Commit

5facdeb

1 Parent(s): 92d1d2a

Error Fix

Browse files

Files changed (1) hide show

app.py +137 -116

app.py CHANGED Viewed

@@ -6,6 +6,8 @@ import tempfile
 import pandas as pd
 import sqlite3
 from langchain_core.prompts import ChatPromptTemplate
 # Load environment variables
 load_dotenv()
@@ -36,59 +38,36 @@ os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
 # Define the prompt with examples
 query_prompt = ChatPromptTemplate.from_messages([
-    ("system", """
-            You are an SQL and data analysis expert. Generate an appropriate SQL query using SQLite syntax for the question provided, without any explanations or code comments.
-            Follow SQLite-specific conventions, as shown in the examples below:
-            Example 1:
-            Question: "What is the average fare for trips over 10 miles?"
-            SQL Query: SELECT AVG(fare_amount) FROM taxi_data WHERE trip_distance > 10;
-            Example 2:
-            Question: "How many trips were taken in each month?"
-            SQL Query: SELECT strftime('%m', pickup_datetime) AS month, COUNT(*) AS trip_count FROM taxi_data GROUP BY month;
-            Example 3:
-            Question: "What is the total fare amount for each driver (medallion) per day?"
-            SQL Query: SELECT DATE(pickup_datetime) AS date, medallion, SUM(fare_amount) AS total_fare FROM taxi_data GROUP BY date, medallion;
-            SQLite-Specific Conventions:
-            1. Date and Time Extraction:
-               - Instead of `EXTRACT(YEAR FROM column)`, use `strftime('%Y', column)` to extract the year.
-               - Example: `SELECT strftime('%Y', pickup_datetime) FROM taxi_data;`
-            2. String Length:
-               - Instead of `CHAR_LENGTH(column)`, use `LENGTH(column)`.
-               - Example: `SELECT LENGTH(passenger_name) FROM taxi_data;`
-            3. Regular Expressions:
-               - SQLite does not support `REGEXP`. Use `LIKE` for simple patterns or avoid regular expressions.
-               - Example: `SELECT * FROM taxi_data WHERE passenger_name LIKE 'A%';`
-            4. Window Functions:
-               - For row numbering, use `ROW_NUMBER()` if supported, or simulate with joins.
-               - Example: `SELECT id, ROW_NUMBER() OVER (ORDER BY pickup_datetime) AS row_num FROM taxi_data;`
-            5. Data Type Casting:
-               - Use `CAST(column AS TYPE)`, but note that SQLite supports limited types.
-               - Example: `SELECT CAST(fare_amount AS INTEGER) FROM taxi_data;`
-            6. Full Outer Join Workaround:
-               - SQLite doesn’t support `FULL OUTER JOIN`. Combine `LEFT JOIN` and `UNION` for a similar effect.
-               - Example:
-                 ```
-                 SELECT a.*, b.*
-                 FROM table_a a
-                 LEFT JOIN table_b b ON a.id = b.id
-                 UNION
-                 SELECT a.*, b.*
-                 FROM table_a a
-                 RIGHT JOIN table_b b ON a.id = b.id;
-                 ```
-            Use these examples and guidelines to generate an SQL query compatible with SQLite syntax for the question provided.
-        """),
     ("human", "{question}")
 ])
@@ -100,88 +79,88 @@ interpret_prompt = ChatPromptTemplate.from_messages(
     ]
 )
 def process_text_query(query, history):
     """Process a text query and update chat history"""
     if not query:
         return "", history
-    # First, check if we have any CSV data loaded
     try:
-        conn = sqlite3.connect(DB_PATH)
-        cursor = conn.cursor()
-        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
-        tables = [row[0] for row in cursor.fetchall()]
-        if tables:
-            # Get table schema information
-            table_info = []
-            for table in tables:
-                cursor.execute(f"PRAGMA table_info({table});")
-                columns = [f"{col[1]} ({col[2]})" for col in cursor.fetchall()]
-                table_info.append(f"Table '{table}' has columns: {', '.join(columns)}")
-            # For questions about specific values, aggregations, or data analysis
-            if any(word in query.lower() for word in [
-                'what is', 'how many', 'highest', 'lowest', 'maximum', 'minimum',
-                'average', 'mean', 'sum', 'total', 'count', 'tip', 'fare', 'amount'
-            ]):
                 try:
-                    # Generate SQL query
-                    context = f"The database contains the following tables:\n{chr(10).join(table_info)}\n\nQuestion: {query}"
-                    sql_query = query_engine.generate_response(query_prompt.format(question=context))
-                    # Execute query
                     result_df = pd.read_sql_query(sql_query, conn)
-                    # Format results
-                    if len(result_df) > 10:
-                        data_str = f"{result_df.head(10).to_string()}\n... (showing 10 of {len(result_df)} rows)"
                     else:
-                        data_str = result_df.to_string()
-                    # Generate response
-                    response = f"**SQL Query:**\n```sql\n{sql_query}\n```\n\n"
-                    if not result_df.empty:
-                        response += f"**Results:**\n```\n{data_str}\n```\n\n"
-                        # Add interpretation
-                        interpret_prompt = f"""
-                        Question: {query}
-                        SQL Query: {sql_query}
-                        Results: {data_str}
-                        Please provide a clear, concise answer to the question based on these results.
-                        """
-                        interpretation = query_engine.generate_response(interpret_prompt)
-                        response += f"**Answer:**\n{interpretation}"
-                    else:
-                        response += "No results found."
-                    history.append({"role": "user", "content": query})
-                    history.append({"role": "assistant", "content": response})
-                    return "", history
                 except Exception as e:
-                    print(f"SQL Error: {str(e)}")
-                    # Fall back to document query if SQL fails
-                    response = document_assistant.process_query(query)
             else:
-                # For non-data analysis questions, use document query
-                response = document_assistant.process_query(query)
-        else:
-            # No tables found, use document query
-            response = document_assistant.process_query(query)
-        conn.close()
     except Exception as e:
-        print(f"Database Error: {str(e)}")
-        # Fall back to document query if database access fails
-        response = document_assistant.process_query(query)
-    # Update history
     history.append({"role": "user", "content": query})
     history.append({"role": "assistant", "content": response})
     return "", history
 def process_file_upload(files):
@@ -189,6 +168,15 @@ def process_file_upload(files):
     if not files:
         return "No files uploaded"
     file_info = []
     for file in files:
         file_path = file.name
@@ -196,16 +184,22 @@ def process_file_upload(files):
         file_ext = os.path.splitext(file_name)[1].lower()
         if file_ext == '.csv':
-            # Special handling for CSV files - load into SQLite
             try:
-                # Create table name from filename (remove extension, replace spaces with underscores)
                 table_name = os.path.splitext(file_name)[0].replace(' ', '_').lower()
                 # Load CSV into SQLite
                 conn = sqlite3.connect(DB_PATH)
                 load_csv_to_sqlite(file_path, conn, table_name)
-                # Get column info for the table
                 cursor = conn.cursor()
                 cursor.execute(f"PRAGMA table_info({table_name});")
                 columns = [f"{col[1]} ({col[2]})" for col in cursor.fetchall()]
@@ -220,15 +214,24 @@ def process_file_upload(files):
                 file_info.append(f"Columns: {', '.join(columns)}")
                 file_info.append(f"Rows: {row_count}")
-                # Also index with document assistant for text search
-                result = document_assistant.upload_document(file_path)
-                file_info.append(f"Also indexed for text search: {result['message']}")
             except Exception as e:
                 file_info.append(f"Error loading CSV {file_name}: {str(e)}")
         else:
-            # Process and index the document
-            result = document_assistant.upload_document(file_path)
-            file_info.append(f"{result['message']} ({result['chunks']} chunks)")
     return "\n".join(file_info)
@@ -311,6 +314,16 @@ def list_documents():
     return "\n".join(doc_list)
 # Create Gradio interface
 with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
     gr.Markdown("# 🤖 AI Document Analysis & Voice Assistant")
@@ -331,6 +344,7 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
         with gr.Row():
             submit_btn = gr.Button("Submit")
             clear_btn = gr.Button("Clear")
         audio_output = gr.Audio(label="Voice Response", type="filepath")
@@ -375,6 +389,13 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
             inputs=[chatbot],
             outputs=[audio_output]
         )
     with gr.Tab("Document Upload"):
         file_upload = gr.File(

 import pandas as pd
 import sqlite3
 from langchain_core.prompts import ChatPromptTemplate
+import plotly.express as px
+import plotly.io as pio
 # Load environment variables
 load_dotenv()
 # Prompt used to turn a natural-language question into one SQLite query.
 # The system message lists aggregation/visualization hints plus three worked
 # examples (all written against a table called `data_tab`) and asks for SQL only;
 # the human message is filled from the `{question}` placeholder.
 query_prompt = ChatPromptTemplate.from_messages([
+    ("system", """You are an SQL expert. Generate an appropriate SQL query using SQLite syntax for the question provided. The query should be executable and return exactly what was asked for.
+For questions about maximum/highest values, use MAX().
+For minimum/lowest values, use MIN().
+For averages, use AVG().
+For counts, use COUNT().
+For sums, use SUM().
+For visualization queries:
+1. For trends over time:
+   - Group by appropriate time unit (day, month, year)
+   - Include relevant aggregations (AVG, COUNT, SUM)
+2. For distributions:
+   - Group by the value being distributed
+   - Include COUNT or frequency
+3. For comparisons:
+   - Include multiple measures
+   - Order appropriately
+Examples:
+1. Question: "Plot tip amount trends by month"
+   SQL: SELECT strftime('%Y-%m', pickup_datetime) as month, AVG(tip_amount) as avg_tip, COUNT(*) as count FROM data_tab GROUP BY month ORDER BY month;
+2. Question: "Show distribution of fare amounts"
+   SQL: SELECT fare_amount, COUNT(*) as frequency FROM data_tab GROUP BY fare_amount ORDER BY fare_amount;
+3. Question: "What is the highest tip_amount in the dataset?"
+   SQL: SELECT MAX(tip_amount) as highest_tip FROM data_tab;
+Generate only the SQL query, nothing else. Make sure to use the correct table name from the context provided."""),
     ("human", "{question}")
 ])
     ]
 )
+# Module-level record of the most recently uploaded file; rebound by process_file_upload() and clear_context(), read by process_text_query()
+current_context = {
+    "file_type": None,  # 'csv', 'pdf' (set for every non-CSV upload), or None when nothing is loaded
+    "file_name": None,  # name of the uploaded file, or None
+    "table_name": None  # SQLite table the CSV was loaded into; stays None for non-CSV uploads
+}
 def process_text_query(query, history):
     """Answer a chat message against the currently loaded file and record it.

     Routing follows ``current_context["file_type"]``:

     * ``"csv"`` - visualization keywords produce a Plotly chart embedded as
       HTML; any other question is answered by running an LLM-generated SQL
       query against the table named in the context.
     * ``"pdf"`` - the question is forwarded to ``document_assistant``.
     * anything else - the user is asked to upload a file first.

     Args:
         query: The user's message. A falsy value is a no-op.
         history: Chat history as a list of ``{"role", "content"}`` dicts;
             it is appended to in place.

     Returns:
         A tuple of an empty string and the updated history.
     """
     if not query:
         return "", history

     lowered = query.lower()
     # Check if query is about visualization
     is_plot_query = any(word in lowered for word in [
         'plot', 'graph', 'chart', 'visualize', 'visualization', 'trend', 'trends'
     ])

     try:
         if current_context["file_type"] == "csv":
             conn = sqlite3.connect(DB_PATH)
             try:
                 if is_plot_query:
                     response = _plot_csv_response(lowered, conn)
                 else:
                     # Previously a `pass` placeholder, which left `response`
                     # unbound and crashed when the history was updated below.
                     response = _sql_csv_response(query, conn)
             finally:
                 # Release the connection even when a handler raises.
                 conn.close()
         elif current_context["file_type"] == "pdf":
             # Process PDF queries using document_assistant
             response = document_assistant.process_query(query)
         else:
             response = "Please upload a file first."
     except Exception as e:
         response = f"Error processing query: {str(e)}"

     # Update history with message format
     history.append({"role": "user", "content": query})
     history.append({"role": "assistant", "content": response})
     return "", history


 def _quoted_table_name():
     """Return the current CSV table name as a double-quoted SQLite identifier."""
     # The name is derived from an uploaded filename, so it may contain
     # characters (dashes, leading digits, quotes) that are invalid unquoted.
     name = current_context["table_name"]
     return '"' + name.replace('"', '""') + '"'


 def _plot_csv_response(lowered_query, conn):
     """Build the chat response for a visualization request on the CSV table."""
     try:
         table = _quoted_table_name()
         if 'trend' in lowered_query:
             # Trend analysis: aggregate tips per calendar month.
             sql_query = f"""
             SELECT strftime('%Y-%m', pickup_datetime) as month,
                    AVG(tip_amount) as avg_tip,
                    COUNT(*) as count,
                    SUM(tip_amount) as total_tip
             FROM {table}
             GROUP BY month
             ORDER BY month;
             """
             result_df = pd.read_sql_query(sql_query, conn)
             fig = px.line(result_df, x='month', y=['avg_tip', 'total_tip'],
                           title='Tip Trends Over Time')
         else:
             # Default to a frequency distribution of tip amounts.
             sql_query = f"""
             SELECT tip_amount, COUNT(*) as frequency
             FROM {table}
             GROUP BY tip_amount
             ORDER BY tip_amount;
             """
             result_df = pd.read_sql_query(sql_query, conn)
             fig = px.bar(result_df, x='tip_amount', y='frequency',
                          title='Distribution of Tip Amounts')
         # Convert plot to HTML
         plot_html = fig.to_html(full_html=False, include_plotlyjs='cdn')
         return f"**Analysis:**\n\nHere's the visualization of the data:\n\n<div>{plot_html}</div>"
     except Exception as e:
         return f"Error creating visualization: {str(e)}"


 def _sql_csv_response(query, conn):
     """Answer a non-visualization question by running LLM-generated SQL."""
     table_name = current_context["table_name"]
     cursor = conn.cursor()
     cursor.execute(f"PRAGMA table_info({_quoted_table_name()});")
     columns = [f"{col[1]} ({col[2]})" for col in cursor.fetchall()]
     table_info = f"Table '{table_name}' has columns: {', '.join(columns)}"

     # Give the model the real table name and schema so it does not fall back
     # on the example table from the prompt.
     context = f"The database contains the following tables:\n{table_info}\n\nQuestion: {query}"
     # NOTE(review): the generated SQL is executed as-is; consider restricting
     # it to SELECT statements before running it.
     sql_query = query_engine.generate_response(query_prompt.format(question=context))
     result_df = pd.read_sql_query(sql_query, conn)

     # Keep the chat message readable for large result sets.
     if len(result_df) > 10:
         data_str = f"{result_df.head(10).to_string()}\n... (showing 10 of {len(result_df)} rows)"
     else:
         data_str = result_df.to_string()

     response = f"**SQL Query:**\n```sql\n{sql_query}\n```\n\n"
     if result_df.empty:
         return response + "No results found."

     response += f"**Results:**\n```\n{data_str}\n```\n\n"
     interpretation_request = f"""
     Question: {query}
     SQL Query: {sql_query}
     Results: {data_str}
     Please provide a clear, concise answer to the question based on these results.
     """
     interpretation = query_engine.generate_response(interpretation_request)
     return response + f"**Answer:**\n{interpretation}"
 def process_file_upload(files):
     if not files:
         return "No files uploaded"
+    global current_context
+    # Clear existing context
+    current_context = {
+        "file_type": None,
+        "file_name": None,
+        "table_name": None
+    }
     file_info = []
     for file in files:
         file_path = file.name
         file_ext = os.path.splitext(file_name)[1].lower()
         if file_ext == '.csv':
             try:
+                # Create table name from filename
                 table_name = os.path.splitext(file_name)[0].replace(' ', '_').lower()
                 # Load CSV into SQLite
                 conn = sqlite3.connect(DB_PATH)
                 load_csv_to_sqlite(file_path, conn, table_name)
+                # Update current context
+                current_context = {
+                    "file_type": "csv",
+                    "file_name": file_name,
+                    "table_name": table_name
+                }
+                # Get column info
                 cursor = conn.cursor()
                 cursor.execute(f"PRAGMA table_info({table_name});")
                 columns = [f"{col[1]} ({col[2]})" for col in cursor.fetchall()]
                 file_info.append(f"Columns: {', '.join(columns)}")
                 file_info.append(f"Rows: {row_count}")
             except Exception as e:
                 file_info.append(f"Error loading CSV {file_name}: {str(e)}")
         else:
+            # Process PDF or other document types
+            try:
+                result = document_assistant.upload_document(file_path)
+                # Update current context
+                current_context = {
+                    "file_type": "pdf",
+                    "file_name": file_name,
+                    "table_name": None
+                }
+                file_info.append(f"{result['message']} ({result['chunks']} chunks)")
+            except Exception as e:
+                file_info.append(f"Error processing document {file_name}: {str(e)}")
     return "\n".join(file_info)
     return "\n".join(doc_list)
+def clear_context():
+    """Clear the current context and chat history"""
+    global current_context  # the dict is rebound (not mutated), so the global declaration is required
+    current_context = {
+        "file_type": None,
+        "file_name": None,
+        "table_name": None
+    }
+    return None  # the "Clear Context" click handler routes this value to the chatbot output; presumably None empties it - confirm against Gradio docs
 # Create Gradio interface
 with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
     gr.Markdown("# 🤖 AI Document Analysis & Voice Assistant")
         with gr.Row():
             submit_btn = gr.Button("Submit")
             clear_btn = gr.Button("Clear")
+            clear_context_btn = gr.Button("Clear Context")
         audio_output = gr.Audio(label="Voice Response", type="filepath")
             inputs=[chatbot],
             outputs=[audio_output]
         )
+        # Add event handler for clear context button
+        clear_context_btn.click(
+            clear_context,
+            inputs=[],
+            outputs=[chatbot]
+        )
     with gr.Tab("Document Upload"):
         file_upload = gr.File(