SVashishta1 committed
Commit 6e54ca7 · Parent(s): f35c7b5
Error Fix
app.py CHANGED
@@ -6,10 +6,9 @@ import tempfile
 import pandas as pd
 import sqlite3
 from langchain_core.prompts import ChatPromptTemplate
 import plotly.express as px
-import plotly.io as pio
 import time
-from functools import lru_cache

 # Load environment variables
 load_dotenv()
@@ -18,164 +17,210 @@ load_dotenv()
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

 from backend.main import DocumentAssistant
-from backend.db import SimpleDB
-from backend.vector_db import ChromaVectorDB
-from backend.query_engine import QueryEngine
-from backend.document_parser import SimpleDocumentParser
-
-# Initialize components
-db = SimpleDB()
-vector_db = ChromaVectorDB(os.getenv("CHROMA_DB_PATH", "./data/chroma_db"))
-query_engine = QueryEngine()

-# Initialize the document
-document_parser = SimpleDocumentParser()
-
-# Initialize DocumentAssistant
 document_assistant = DocumentAssistant()

 # Database path for CSV data
 DB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "csv_data.db")
 os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)

-#
-        For counts, use COUNT().
-        For sums, use SUM().
-        - Include relevant aggregations (AVG, COUNT, SUM)
-    2. For distributions:
-        - Group by the value being distributed
-        - Include COUNT or frequency
-    3. For comparisons:
-        - Include multiple measures
-        - Order appropriately

 # Define the prompt for interpreting the SQL query result
 interpret_prompt = ChatPromptTemplate.from_messages(
     [
-        ("system", "You are an experienced data analyst.
-        ("human", "Question: {question}\
     ]
 )

-# Add this as a global variable to track current context
-current_context = {
-    "file_type": None,  # 'csv' or 'pdf' or None
-    "file_name": None,
-    "table_name": None
-}
-
-# Add a simple cache for database schema information
-@lru_cache(maxsize=32)
-def get_table_info(table_name):
-    """Get cached table information"""
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-
-    # Get column info
-    cursor.execute(f"PRAGMA table_info({table_name});")
-    columns = [f"{col[1]} ({col[2]})" for col in cursor.fetchall()]
-
-    # Get row count - use approximate count for large tables
-    cursor.execute(f"SELECT COUNT(*) FROM {table_name} LIMIT 1;")
-    row_count = cursor.fetchone()[0]
-
-    conn.close()
-    return columns, row_count
-
-# Optimize the process_text_query function
 def process_text_query(query, history):
     """Process a text query and update chat history"""
     if not query:
         return "", history

-    # Add a loading message to the history
     history.append({"role": "user", "content": query})
-    history.append({"role": "assistant", "content": "Processing your query..."})

     if current_context["file_type"] == "csv" and current_context["table_name"]:
-        table_name = current_context["table_name"]
         try:
-            #
-            SELECT * FROM {table_name} LIMIT 10;
-            """
-            #
-                    sql_query = f"SELECT MAX({col}) as max_value FROM {table_name};"
-                    break
-            for col in ["tip_amount", "fare_amount", "total_amount"]:
-                if col in query.lower():
-                    sql_query = f"SELECT AVG({col}) as avg_value FROM {table_name};"
-                    break
-            #
-            conn.execute("PRAGMA journal_mode = OFF;")  # Disable journaling
-            conn.execute("PRAGMA synchronous = OFF;")  # Disable synchronous writes

-            result_df = pd.read_sql_query(sql_query, conn, chunksize=1000)
-            #
-            result_df = next(result_df)

-            #
-                if "avg" in col.lower():
-                    response += f"The average value is {result_df[col].iloc[0]:.2f}."
-                    break
                 else:
-                    response +=
-            response

             conn.close()
@@ -193,8 +238,8 @@ def process_text_query(query, history):
     processing_time = time.time() - start_time
     response += f"\n\n(Query processed in {processing_time:.2f} seconds)"

-    #
-    history

     return "", history
@@ -223,40 +268,43 @@ def process_file_upload(files):
                 # Create table name from filename
                 table_name = os.path.splitext(file_name)[0].replace(' ', '_').lower()

-                #
                 conn = sqlite3.connect(DB_PATH)

-                #
                 # Update current context
                 current_context = {
                     "file_type": "csv",
                     "file_name": file_name,
-                    "table_name":
                 }

-                # Get
                 cursor = conn.cursor()
-                cursor.execute(

                 conn.close()

                 file_info.append("✅ CSV File Successfully Loaded")
-                file_info.append(f"π Table Name:
                 file_info.append(f"π Total Rows: {row_count:,}")
-                file_info.append(f"π Columns: {

             except Exception as e:
                 file_info.append(f"❌ Error loading CSV {file_name}: {str(e)}")
-                # Print the full error for debugging
-                import traceback
-                print(traceback.format_exc())

         else:
             # Process PDF or other document types
@@ -279,83 +327,6 @@ def process_file_upload(files):

     return "\n".join(file_info)

-def process_voice_input(audio_path):
-    """Process voice input and return transcribed text"""
-    if audio_path is None:
-        return "No audio recorded"
-
-    # Since we don't have VoiceAssistant, return a placeholder message
-    return "Voice transcription is not available"
-
-def text_to_speech_output(text):
-    """Convert text to speech"""
-    if not text or len(text) == 0:
-        return None
-
-    # Extract the last assistant message
-    last_message = None
-    for msg in reversed(text):
-        if msg["role"] == "assistant":
-            last_message = msg["content"]
-            break
-
-    if not last_message:
-        return None
-
-    # Since we don't have VoiceAssistant, return None
-    return None
-
-# Optimize the load_csv_to_sqlite function
-def load_csv_to_sqlite(file_path, conn, table_name):
-    """Load CSV data into SQLite database with optimizations"""
-    # Use larger chunk size for faster loading
-    chunksize = 10000
-
-    # Configure SQLite for faster imports
-    conn.execute("PRAGMA synchronous = OFF")
-    conn.execute("PRAGMA journal_mode = MEMORY")
-    conn.execute("PRAGMA temp_store = MEMORY")
-    conn.execute("PRAGMA cache_size = 10000")
-
-    try:
-        # Start transaction manually
-        conn.execute("BEGIN TRANSACTION")
-
-        # Read the CSV in chunks
-        for i, chunk in enumerate(pd.read_csv(file_path, chunksize=chunksize)):
-            # Optimize column types
-            for col in chunk.columns:
-                # Convert date columns to datetime
-                if 'date' in col.lower() or 'time' in col.lower():
-                    try:
-                        chunk[col] = pd.to_datetime(chunk[col], errors='coerce')
-                    except:
-                        pass
-
-            # Load the chunk into the SQLite database
-            if_exists = 'replace' if i == 0 else 'append'
-            chunk.to_sql(table_name, conn, if_exists=if_exists, index=False, method='multi')
-
-        # Create indices for common query columns
-        for col in ['pickup_datetime', 'dropoff_datetime', 'tip_amount', 'fare_amount', 'total_amount']:
-            try:
-                if col in chunk.columns:  # Only create index if column exists
-                    conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{table_name}_{col} ON {table_name}({col})")
-            except Exception as idx_error:
-                print(f"Warning: Could not create index on {col}: {str(idx_error)}")
-
-        # Commit the transaction
-        conn.commit()
-        print(f"Successfully loaded {table_name} into database")
-
-    except Exception as e:
-        # Only try to rollback if we're in a transaction
-        try:
-            conn.rollback()
-        except:
-            pass  # If rollback fails, just continue
-        raise e
-
 def list_documents():
     """List all indexed documents"""
     info_list = []
@@ -378,22 +349,7 @@ def list_documents():
             cursor.execute(f"SELECT COUNT(*) FROM {table[0]};")
             row_count = cursor.fetchone()[0]

-            sample_info = []
-            for col in ['vendor_id', 'rate_code', 'payment_type']:
-                if col in columns:
-                    cursor.execute(f"SELECT DISTINCT {col} FROM {table[0]} LIMIT 5;")
-                    unique_vals = [str(row[0]) for row in cursor.fetchall()]
-                    if unique_vals:
-                        sample_info.append(f"{col}: {', '.join(unique_vals)}")
-
-            info_list.append(f"\n🔹 Table: {table[0]}")
-            info_list.append(f"  - Rows: {row_count:,}")
-            info_list.append(f"  - Columns: {len(columns)}")
-            if sample_info:
-                info_list.append("  - Sample values:")
-                for info in sample_info:
-                    info_list.append(f"    • {info}")

         conn.close()
     except Exception as e:
@@ -421,6 +377,32 @@ def clear_context():
     }
     return None

 # Create Gradio interface
 with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
     gr.Markdown("# π€ AI Document Analysis & Voice Assistant")
@@ -466,6 +448,7 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
     )

     clear_btn.click(lambda: None, None, chatbot, queue=False)

     voice_btn.click(
         lambda: gr.update(visible=True),
@@ -486,13 +469,6 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
         inputs=[chatbot],
         outputs=[audio_output]
     )
-
-    # Add event handler for clear context button
-    clear_context_btn.click(
-        clear_context,
-        inputs=[],
-        outputs=[chatbot]
-    )

     with gr.Tab("Document Upload"):
         file_upload = gr.File(
@@ -6,10 +6,9 @@ import tempfile
 import pandas as pd
 import sqlite3
 from langchain_core.prompts import ChatPromptTemplate
+from langchain_groq import ChatGroq
 import plotly.express as px
 import time

 # Load environment variables
 load_dotenv()
@@ -18,164 +17,210 @@ load_dotenv()
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

 from backend.main import DocumentAssistant

+# Initialize the document assistant
 document_assistant = DocumentAssistant()

+# Initialize the LLM using the llama3-8b-8192 model from Groq
+llm = ChatGroq(
+    model="llama3-8b-8192",
+    temperature=0,
+    max_tokens=None,
+    timeout=None,
+    max_retries=2,
+    verbose=True,
+    api_key=os.getenv("GROQ_API_KEY")
+)
+
 # Database path for CSV data
 DB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "csv_data.db")
 os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)

+# Current context to track what we're working with
+current_context = {
+    "file_type": None,
+    "file_name": None,
+    "table_name": None
+}
+
+# Define the prompt with examples for SQL query generation
+query_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", """
+        You are an SQL and data analysis expert. Generate an appropriate SQL query using SQLite syntax for the question provided, without any explanations or code comments.
+        Follow SQLite-specific conventions, as shown in the examples below:
+
+        Example 1:
+        Question: "What is the average fare for trips over 10 miles?"
+        SQL Query: SELECT AVG(fare_amount) FROM data_tab WHERE trip_distance > 10;
+
+        Example 2:
+        Question: "How many trips were taken in each month?"
+        SQL Query: SELECT strftime('%m', pickup_datetime) AS month, COUNT(*) AS trip_count FROM data_tab GROUP BY month;
+
+        Example 3:
+        Question: "What is the total fare amount for each driver (medallion) per day?"
+        SQL Query: SELECT DATE(pickup_datetime) AS date, medallion, SUM(fare_amount) AS total_fare FROM data_tab GROUP BY date, medallion;
+
+        Example 4:
+        Question: "What is the highest tip amount in the dataset?"
+        SQL Query: SELECT MAX(tip_amount) as highest_tip FROM data_tab;
+
+        Example 5:
+        Question: "Plot a bar graph for tip trends by month"
+        SQL Query: SELECT strftime('%Y-%m', pickup_datetime) as month, AVG(tip_amount) as avg_tip, COUNT(*) as count FROM data_tab GROUP BY month ORDER BY month;
+
+        SQLite-Specific Conventions:
+
+        1. Date and Time Extraction:
+           - Instead of `EXTRACT(YEAR FROM column)`, use `strftime('%Y', column)` to extract the year.
+           - Example: `SELECT strftime('%Y', pickup_datetime) FROM data_tab;`

+        2. String Length:
+           - Instead of `CHAR_LENGTH(column)`, use `LENGTH(column)`.
+           - Example: `SELECT LENGTH(passenger_name) FROM data_tab;`

+        3. Regular Expressions:
+           - SQLite does not support `REGEXP`. Use `LIKE` for simple patterns or avoid regular expressions.
+           - Example: `SELECT * FROM data_tab WHERE passenger_name LIKE 'A%';`

+        4. Window Functions:
+           - For row numbering, use `ROW_NUMBER()` if supported, or simulate with joins.
+           - Example: `SELECT id, ROW_NUMBER() OVER (ORDER BY pickup_datetime) AS row_num FROM data_tab;`

+        5. Data Type Casting:
+           - Use `CAST(column AS TYPE)`, but note that SQLite supports limited types.
+           - Example: `SELECT CAST(fare_amount AS INTEGER) FROM data_tab;`

+        6. Full Outer Join Workaround:
+           - SQLite doesn't support `FULL OUTER JOIN`. Combine `LEFT JOIN` and `UNION` for a similar effect.
+           - Example:
+             ```
+             SELECT a.*, b.*
+             FROM table_a a
+             LEFT JOIN table_b b ON a.id = b.id
+             UNION
+             SELECT a.*, b.*
+             FROM table_a a
+             RIGHT JOIN table_b b ON a.id = b.id;
+             ```

+        Use these examples and guidelines to generate an SQL query compatible with SQLite syntax for the question provided.
+        Always use 'data_tab' as the table name.
+        """),
+        ("human", "{question}"),
+    ]
+)

 # Define the prompt for interpreting the SQL query result
 interpret_prompt = ChatPromptTemplate.from_messages(
     [
+        ("system", "You are an experienced data analyst. Provide a concise, natural language answer based on the given data summary. If relevant, give key statistics, trends, or patterns."),
+        ("human", "Question: {question}\nSQL Query: {sql_query}\nData Summary:\n{data_summary}")
     ]
 )

 def process_text_query(query, history):
     """Process a text query and update chat history"""
     if not query:
         return "", history

+    # Add the user's query to history
     history.append({"role": "user", "content": query})

+    start_time = time.time()
+
+    # Check if we're in CSV context
     if current_context["file_type"] == "csv" and current_context["table_name"]:
         try:
+            # Connect to the database
+            conn = sqlite3.connect(DB_PATH)

+            # Get column information for context
+            cursor = conn.cursor()
+            cursor.execute(f"PRAGMA table_info({current_context['table_name']});")
+            columns = [info[1] for info in cursor.fetchall()]
+            columns_str = ", ".join(columns)

+            # Create question with context
+            question_with_context = f"The table 'data_tab' has columns: {columns_str}. {query}"

+            # Generate SQL query using LLM
+            ai_msg = query_prompt | llm
+            sql_query = ai_msg.invoke({"question": question_with_context}).content.strip()

+            print(f"Generated SQL Query: {sql_query}")

+            # Check if this is a visualization request
+            is_visualization = any(word in query.lower() for word in ['plot', 'graph', 'chart', 'visualize', 'visualization', 'trend'])

+            try:
+                # Execute the query
+                result_df = pd.read_sql_query(sql_query, conn)
+
+                # Generate data summary
+                if not result_df.empty:
+                    data_summary = result_df.describe(include='all').to_string()
+
+                    # For small result sets, include the actual data
+                    if len(result_df) <= 10:
+                        data_summary += f"\n\nFull Results:\n{result_df.to_string()}"
+                    else:
+                        data_summary += f"\n\nFirst 5 rows:\n{result_df.head(5).to_string()}"
+                else:
+                    data_summary = "No relevant data found."
+
+                # Generate interpretation
+                answer_chain = interpret_prompt | llm
+                interpretation = answer_chain.invoke({
+                    "question": query,
+                    "sql_query": sql_query,
+                    "data_summary": data_summary
+                }).content.strip()

+                # Create the response
+                response = f"**SQL Query:**\n```sql\n{sql_query}\n```\n\n"
+
+                if not result_df.empty:
+                    if len(result_df) > 10:
+                        response += f"**Results (first 5 of {len(result_df)} rows):**\n```\n{result_df.head(5).to_string()}\n```\n\n"
+                    else:
+                        response += f"**Results:**\n```\n{result_df.to_string()}\n```\n\n"
                 else:
+                    response += "**No results found.**\n\n"
+
+                response += f"**Analysis:**\n{interpretation}"
+
+                # Add visualization if requested
+                if is_visualization and not result_df.empty:
+                    try:
+                        # Determine the type of visualization based on the data
+                        if len(result_df.columns) >= 2:
+                            # Find numeric columns for y-axis
+                            numeric_cols = result_df.select_dtypes(include=['number']).columns.tolist()
+
+                            if len(numeric_cols) >= 1 and len(result_df) > 1:
+                                # Use the first column as x and first numeric column as y
+                                x_col = result_df.columns[0]
+                                y_col = numeric_cols[0]
+
+                                # Create appropriate plot based on data characteristics
+                                if 'month' in result_df.columns or 'date' in result_df.columns or 'year' in result_df.columns:
+                                    # Time series data - use line chart
+                                    fig = px.line(result_df, x=x_col, y=numeric_cols, title="Time Series Analysis")
+                                else:
+                                    # Regular data - use bar chart
+                                    fig = px.bar(result_df, x=x_col, y=y_col, title="Data Visualization")
+
+                                # Convert to HTML and add to response
+                                plot_html = fig.to_html(full_html=False, include_plotlyjs='cdn')
+                                response += f"\n\n**Visualization:**\n<div>{plot_html}</div>"
+                    except Exception as viz_error:
+                        print(f"Visualization error: {str(viz_error)}")
+                        # Continue without visualization if there's an error
+
+            except Exception as e:
+                response = f"**SQL Query:**\n```sql\n{sql_query}\n```\n\n**Error executing query:** {str(e)}"

             conn.close()
@@ -193,8 +238,8 @@ def process_text_query(query, history):
     processing_time = time.time() - start_time
     response += f"\n\n(Query processed in {processing_time:.2f} seconds)"

+    # Add the response to history
+    history.append({"role": "assistant", "content": response})

     return "", history
@@ -223,40 +268,43 @@ def process_file_upload(files):
                 # Create table name from filename
                 table_name = os.path.splitext(file_name)[0].replace(' ', '_').lower()

+                # Load CSV into SQLite
                 conn = sqlite3.connect(DB_PATH)

+                # Configure SQLite for faster imports
+                conn.execute("PRAGMA synchronous = OFF")
+                conn.execute("PRAGMA journal_mode = MEMORY")
+
+                # Read the CSV and load it into SQLite
+                df = pd.read_csv(file_path)
+                df.to_sql('data_tab', conn, if_exists='replace', index=False)

                 # Update current context
                 current_context = {
                     "file_type": "csv",
                     "file_name": file_name,
+                    "table_name": "data_tab"  # Always use data_tab as the table name
                 }

+                # Get column info
                 cursor = conn.cursor()
+                cursor.execute("PRAGMA table_info(data_tab);")
+                columns = [f"{col[1]} ({col[2]})" for col in cursor.fetchall()]

+                # Get row count
+                cursor.execute("SELECT COUNT(*) FROM data_tab;")
+                row_count = cursor.fetchone()[0]

                 conn.close()

                 file_info.append("✅ CSV File Successfully Loaded")
+                file_info.append(f"π Table Name: data_tab")
+                file_info.append(f"π Source File: {file_name}")
                 file_info.append(f"π Total Rows: {row_count:,}")
+                file_info.append(f"π Columns: {', '.join(columns)}")

             except Exception as e:
                 file_info.append(f"❌ Error loading CSV {file_name}: {str(e)}")

         else:
             # Process PDF or other document types
@@ -279,83 +327,6 @@ def process_file_upload(files):

     return "\n".join(file_info)

 def list_documents():
     """List all indexed documents"""
     info_list = []
@@ -378,22 +349,7 @@ def list_documents():
             cursor.execute(f"SELECT COUNT(*) FROM {table[0]};")
             row_count = cursor.fetchone()[0]

+            info_list.append(f"- {table[0]} ({row_count:,} rows, {len(columns)} columns)")

         conn.close()
     except Exception as e:
@@ -421,6 +377,32 @@ def clear_context():
     }
     return None

+def process_voice_input(audio_path):
+    """Process voice input and return transcribed text"""
+    if audio_path is None:
+        return "No audio recorded"
+
+    # Since we don't have VoiceAssistant, return a placeholder message
+    return "Voice transcription is not available"
+
+def text_to_speech_output(text):
+    """Convert text to speech"""
+    if not text or len(text) == 0:
+        return None
+
+    # Extract the last assistant message
+    last_message = None
+    for msg in reversed(text):
+        if msg["role"] == "assistant":
+            last_message = msg["content"]
+            break
+
+    if not last_message:
+        return None
+
+    # Since we don't have VoiceAssistant, return None
+    return None
+
 # Create Gradio interface
 with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
     gr.Markdown("# π€ AI Document Analysis & Voice Assistant")
@@ -466,6 +448,7 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
     )

     clear_btn.click(lambda: None, None, chatbot, queue=False)
+    clear_context_btn.click(clear_context, inputs=[], outputs=[chatbot])

     voice_btn.click(
         lambda: gr.update(visible=True),
@@ -486,13 +469,6 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
         inputs=[chatbot],
         outputs=[audio_output]
     )

     with gr.Tab("Document Upload"):
         file_upload = gr.File(
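For orientation, here is a minimal standalone sketch of the flow this commit wires into `process_text_query` (prompt → ChatGroq → SQLite → interpretation). It assumes a CSV has already been loaded into the `data_tab` table of `./data/csv_data.db` and that `GROQ_API_KEY` is set; the sample question and the trimmed-down system prompts are illustrative placeholders, not part of app.py.

```python
import os
import sqlite3

import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

# Minimal sketch of the new text-to-SQL path: ask the LLM for a SQLite query,
# run it against the uploaded CSV (stored as 'data_tab'), then summarize the result.
llm = ChatGroq(model="llama3-8b-8192", temperature=0, api_key=os.getenv("GROQ_API_KEY"))

sql_prompt = ChatPromptTemplate.from_messages([
    ("system", "Return a single SQLite query for the table 'data_tab'. SQL only, no prose."),
    ("human", "{question}"),
])
interpret = ChatPromptTemplate.from_messages([
    ("system", "You are a data analyst. Answer the question from the data summary."),
    ("human", "Question: {question}\nSQL: {sql_query}\nData:\n{data_summary}"),
])

question = "What is the highest tip amount in the dataset?"  # placeholder question
sql_query = (sql_prompt | llm).invoke({"question": question}).content.strip()

conn = sqlite3.connect("./data/csv_data.db")
result_df = pd.read_sql_query(sql_query, conn)  # raises if the generated SQL is invalid
conn.close()

answer = (interpret | llm).invoke({
    "question": question,
    "sql_query": sql_query,
    "data_summary": result_df.to_string(),
}).content.strip()
print(sql_query, answer, sep="\n\n")
```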