Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,7 @@ import sqlite3
|
|
| 6 |
import pandas as pd
|
| 7 |
from datetime import datetime, timedelta
|
| 8 |
import random
|
|
|
|
| 9 |
|
| 10 |
# Pydantic models for structured output
|
| 11 |
class ValidationStatus(BaseModel):
|
|
@@ -20,81 +21,251 @@ class SQLQueryGeneration(BaseModel):
|
|
| 20 |
execution_notes: list[str]
|
| 21 |
validation_status: ValidationStatus
|
| 22 |
|
| 23 |
-
#
|
| 24 |
-
def
|
| 25 |
-
"""Generate sample
|
| 26 |
-
first_names = ["Alice", "Bob", "Carol", "David", "Emma", "Frank", "Grace", "Henry", "Ivy", "Jack"]
|
| 27 |
-
last_names = ["Johnson", "Smith", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez"]
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
lname = random.choice(last_names)
|
| 33 |
-
customers.append({
|
| 34 |
-
'customer_id': i,
|
| 35 |
-
'name': f"{fname} {lname}",
|
| 36 |
-
'email': f"{fname.lower()}{i}@example.com"
|
| 37 |
-
})
|
| 38 |
-
return customers
|
| 39 |
-
|
| 40 |
-
def generate_sample_orders(customer_count=10, order_count=20):
|
| 41 |
-
"""Generate sample order data"""
|
| 42 |
-
orders = []
|
| 43 |
-
base_date = datetime.now()
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
def
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
def create_database_from_tables(tables_used):
|
| 74 |
-
"""Create SQLite database with sample data
|
| 75 |
conn = sqlite3.connect(':memory:')
|
| 76 |
cursor = conn.cursor()
|
| 77 |
|
| 78 |
sample_data = {}
|
| 79 |
|
| 80 |
-
# Generate data
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
products = generate_sample_products(15)
|
| 95 |
-
df_products = pd.DataFrame(products)
|
| 96 |
-
df_products.to_sql('products', conn, index=False, if_exists='replace')
|
| 97 |
-
sample_data['products'] = df_products
|
| 98 |
|
| 99 |
return conn, sample_data
|
| 100 |
|
|
@@ -109,10 +280,10 @@ def execute_sql_on_sample_data(sql_query, conn):
|
|
| 109 |
def process_nl_query(api_key, natural_query):
|
| 110 |
"""Main function to process natural language query"""
|
| 111 |
if not api_key:
|
| 112 |
-
return "β Please enter your Groq API key", "",
|
| 113 |
|
| 114 |
if not natural_query:
|
| 115 |
-
return "β Please enter a natural language query", "",
|
| 116 |
|
| 117 |
try:
|
| 118 |
# Initialize Groq client
|
|
@@ -123,7 +294,7 @@ def process_nl_query(api_key, natural_query):
|
|
| 123 |
output_text += "### Step 1: Understanding User Intent\n"
|
| 124 |
output_text += f"**User Query:** {natural_query}\n\n"
|
| 125 |
|
| 126 |
-
# Call Groq API for SQL generation
|
| 127 |
response = client.chat.completions.create(
|
| 128 |
model="moonshotai/kimi-k2-instruct-0905",
|
| 129 |
messages=[
|
|
@@ -131,7 +302,7 @@ def process_nl_query(api_key, natural_query):
|
|
| 131 |
"role": "system",
|
| 132 |
"content": """You are a SQL expert. Generate structured SQL queries from natural language descriptions with proper syntax validation and metadata.
|
| 133 |
|
| 134 |
-
Return your response in JSON format with the following structure:
|
| 135 |
{
|
| 136 |
"query": "SQL query string",
|
| 137 |
"query_type": "SELECT/INSERT/UPDATE/DELETE",
|
|
@@ -144,7 +315,14 @@ def process_nl_query(api_key, natural_query):
|
|
| 144 |
}
|
| 145 |
}
|
| 146 |
|
| 147 |
-
Use standard SQL syntax compatible with SQLite.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
},
|
| 149 |
{
|
| 150 |
"role": "user",
|
|
@@ -186,12 +364,16 @@ def process_nl_query(api_key, natural_query):
|
|
| 186 |
|
| 187 |
# Step 3: Generate Sample Database Tables
|
| 188 |
output_text += "### Step 3: Auto-Generated Sample Database Tables\n\n"
|
|
|
|
|
|
|
| 189 |
conn, sample_data = create_database_from_tables(sql_query_gen.tables_used)
|
| 190 |
|
| 191 |
-
# Display sample tables
|
| 192 |
for table_name, df in sample_data.items():
|
| 193 |
-
output_text += f"**π Sample `{table_name}` Table
|
| 194 |
-
output_text += df.to_markdown(index=False)
|
|
|
|
|
|
|
| 195 |
output_text += "\n\n"
|
| 196 |
|
| 197 |
# Step 4: Execute SQL Query
|
|
@@ -202,10 +384,10 @@ def process_nl_query(api_key, natural_query):
|
|
| 202 |
|
| 203 |
if error:
|
| 204 |
output_text += f"β **Execution Error:** {error}\n"
|
| 205 |
-
result_table =
|
| 206 |
else:
|
| 207 |
output_text += "β
**Query executed successfully!**\n\n"
|
| 208 |
-
output_text += "**π SQL Execution Result
|
| 209 |
if len(result_df) > 0:
|
| 210 |
output_text += result_df.to_markdown(index=False)
|
| 211 |
else:
|
|
@@ -217,19 +399,14 @@ def process_nl_query(api_key, natural_query):
|
|
| 217 |
# Format outputs for Gradio
|
| 218 |
json_output = json.dumps(sql_query_gen.model_dump(), indent=2)
|
| 219 |
|
| 220 |
-
|
| 221 |
-
result_display = result_df
|
| 222 |
-
else:
|
| 223 |
-
result_display = pd.DataFrame({"Error": [error]})
|
| 224 |
-
|
| 225 |
-
return output_text, json_output, result_display, sql_query_gen.query
|
| 226 |
|
| 227 |
except Exception as e:
|
| 228 |
-
error_msg = f"β **Error:** {str(e)}\n\nPlease check your API key and
|
| 229 |
-
return error_msg, "", pd.DataFrame(), ""
|
| 230 |
|
| 231 |
# Create Gradio Interface
|
| 232 |
-
with gr.Blocks(title="Natural Language to SQL Query Executor", theme=gr.themes.
|
| 233 |
gr.Markdown("""
|
| 234 |
# π Natural Language to SQL Query Executor
|
| 235 |
|
|
@@ -237,8 +414,10 @@ with gr.Blocks(title="Natural Language to SQL Query Executor", theme=gr.themes.S
|
|
| 237 |
|
| 238 |
**Example queries to try:**
|
| 239 |
- "Find all customers who made orders over $500 in the last 30 days, show their name, email, and total order amount"
|
| 240 |
-
- "Show all
|
| 241 |
-
- "List
|
|
|
|
|
|
|
| 242 |
""")
|
| 243 |
|
| 244 |
with gr.Row():
|
|
@@ -276,7 +455,8 @@ with gr.Blocks(title="Natural Language to SQL Query Executor", theme=gr.themes.S
|
|
| 276 |
gr.Markdown("### π Query Execution Result")
|
| 277 |
result_output = gr.Dataframe(
|
| 278 |
label="Result Table",
|
| 279 |
-
interactive=False
|
|
|
|
| 280 |
)
|
| 281 |
|
| 282 |
# Connect the button to the processing function
|
|
@@ -289,20 +469,23 @@ with gr.Blocks(title="Natural Language to SQL Query Executor", theme=gr.themes.S
|
|
| 289 |
gr.Markdown("""
|
| 290 |
---
|
| 291 |
### π How it works:
|
| 292 |
-
1. **Enter your Groq API key** - Required for SQL generation
|
| 293 |
2. **Write your query in plain English** - Describe what data you want to find
|
| 294 |
3. **Click Generate & Execute** - The system will:
|
| 295 |
- Convert your query to SQL
|
| 296 |
-
-
|
|
|
|
| 297 |
- Execute the query
|
| 298 |
- Show you the results
|
| 299 |
|
| 300 |
### π― Features:
|
| 301 |
-
- β
Natural language to SQL conversion
|
| 302 |
-
- β
|
|
|
|
| 303 |
- β
Query validation and metadata
|
| 304 |
- β
SQL execution on sample data
|
| 305 |
- β
Structured JSON output format
|
|
|
|
| 306 |
""")
|
| 307 |
|
| 308 |
# Launch the app
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
from datetime import datetime, timedelta
|
| 8 |
import random
|
| 9 |
+
import re
|
| 10 |
|
| 11 |
# Pydantic models for structured output
|
| 12 |
class ValidationStatus(BaseModel):
|
|
|
|
| 21 |
execution_notes: list[str]
|
| 22 |
validation_status: ValidationStatus
|
| 23 |
|
| 24 |
+
# Enhanced data generators for ANY table type
|
| 25 |
+
def generate_generic_table_data(table_name, row_count=15):
|
| 26 |
+
"""Generate sample data for ANY table based on common patterns"""
|
|
|
|
|
|
|
| 27 |
|
| 28 |
+
# Define field generators
|
| 29 |
+
def gen_id():
|
| 30 |
+
return list(range(1, row_count + 1))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
+
def gen_names():
|
| 33 |
+
first = ["Alice", "Bob", "Carol", "David", "Emma", "Frank", "Grace", "Henry", "Ivy", "Jack",
|
| 34 |
+
"Karen", "Leo", "Maria", "Nathan", "Olivia"]
|
| 35 |
+
last = ["Johnson", "Smith", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis",
|
| 36 |
+
"Rodriguez", "Martinez", "Anderson", "Taylor", "Thomas", "Moore", "Jackson"]
|
| 37 |
+
return [f"{random.choice(first)} {random.choice(last)}" for _ in range(row_count)]
|
| 38 |
+
|
| 39 |
+
def gen_emails(names=None):
|
| 40 |
+
if names:
|
| 41 |
+
return [f"{name.lower().replace(' ', '.')}@example.com" for name in names]
|
| 42 |
+
return [f"user{i}@example.com" for i in range(1, row_count + 1)]
|
| 43 |
+
|
| 44 |
+
def gen_dates(days_back=365):
|
| 45 |
+
base = datetime.now()
|
| 46 |
+
return [(base - timedelta(days=random.randint(0, days_back))).strftime('%Y-%m-%d')
|
| 47 |
+
for _ in range(row_count)]
|
| 48 |
+
|
| 49 |
+
def gen_amounts():
|
| 50 |
+
return [round(random.uniform(100, 5000), 2) for _ in range(row_count)]
|
| 51 |
+
|
| 52 |
+
def gen_salaries():
|
| 53 |
+
return [random.choice([45000, 55000, 65000, 75000, 85000, 95000, 105000, 120000])
|
| 54 |
+
for _ in range(row_count)]
|
| 55 |
+
|
| 56 |
+
def gen_prices():
|
| 57 |
+
return [round(random.uniform(10, 1000), 2) for _ in range(row_count)]
|
| 58 |
+
|
| 59 |
+
def gen_quantities():
|
| 60 |
+
return [random.randint(0, 100) for _ in range(row_count)]
|
| 61 |
+
|
| 62 |
+
def gen_ratings():
|
| 63 |
+
return [round(random.uniform(1, 10), 1) for _ in range(row_count)]
|
| 64 |
+
|
| 65 |
+
def gen_scores():
|
| 66 |
+
return [random.randint(60, 100) for _ in range(row_count)]
|
| 67 |
+
|
| 68 |
+
def gen_ages():
|
| 69 |
+
return [random.randint(18, 80) for _ in range(row_count)]
|
| 70 |
+
|
| 71 |
+
def gen_boolean():
|
| 72 |
+
return [random.choice([True, False, True, True]) for _ in range(row_count)]
|
| 73 |
|
| 74 |
+
def gen_status():
|
| 75 |
+
return [random.choice(['Active', 'Inactive', 'Pending', 'Active', 'Active'])
|
| 76 |
+
for _ in range(row_count)]
|
| 77 |
+
|
| 78 |
+
# Table-specific schemas with intelligent field detection
|
| 79 |
+
table_schemas = {
|
| 80 |
+
'employees': {
|
| 81 |
+
'employee_id': gen_id(),
|
| 82 |
+
'name': gen_names(),
|
| 83 |
+
'email': gen_emails(gen_names()),
|
| 84 |
+
'department_id': [random.randint(1, 5) for _ in range(row_count)],
|
| 85 |
+
'salary': gen_salaries(),
|
| 86 |
+
'hire_date': gen_dates(1825),
|
| 87 |
+
'position': [random.choice(['Engineer', 'Manager', 'Analyst', 'Developer', 'Designer'])
|
| 88 |
+
for _ in range(row_count)]
|
| 89 |
+
},
|
| 90 |
+
'departments': {
|
| 91 |
+
'id': list(range(1, 6)),
|
| 92 |
+
'name': ['Engineering', 'Sales', 'Marketing', 'HR', 'Finance'],
|
| 93 |
+
'manager_id': [random.randint(1, 15) for _ in range(5)],
|
| 94 |
+
'budget': [random.randint(100000, 1000000) for _ in range(5)]
|
| 95 |
+
}[:5],
|
| 96 |
+
'books': {
|
| 97 |
+
'book_id': gen_id(),
|
| 98 |
+
'title': [f"Book Title {i}" for i in range(1, row_count + 1)],
|
| 99 |
+
'author': gen_names(),
|
| 100 |
+
'publication_year': [random.randint(2000, 2025) for _ in range(row_count)],
|
| 101 |
+
'isbn': [f"978-{random.randint(1000000000, 9999999999)}" for _ in range(row_count)],
|
| 102 |
+
'available': gen_boolean(),
|
| 103 |
+
'category': [random.choice(['Fiction', 'Science', 'History', 'Technology', 'Arts'])
|
| 104 |
+
for _ in range(row_count)]
|
| 105 |
+
},
|
| 106 |
+
'students': {
|
| 107 |
+
'student_id': gen_id(),
|
| 108 |
+
'name': gen_names(),
|
| 109 |
+
'email': gen_emails(gen_names()),
|
| 110 |
+
'age': [random.randint(18, 25) for _ in range(row_count)],
|
| 111 |
+
'major': [random.choice(['Computer Science', 'Engineering', 'Business', 'Mathematics', 'Physics'])
|
| 112 |
+
for _ in range(row_count)],
|
| 113 |
+
'gpa': [round(random.uniform(2.5, 4.0), 2) for _ in range(row_count)],
|
| 114 |
+
'enrollment_year': [random.randint(2020, 2025) for _ in range(row_count)]
|
| 115 |
+
},
|
| 116 |
+
'courses': {
|
| 117 |
+
'course_id': gen_id(),
|
| 118 |
+
'course_name': [f"Course {i}" for i in range(1, row_count + 1)],
|
| 119 |
+
'subject': [random.choice(['Mathematics', 'Computer Science', 'Physics', 'Chemistry'])
|
| 120 |
+
for _ in range(row_count)],
|
| 121 |
+
'credits': [random.choice([3, 4, 5]) for _ in range(row_count)],
|
| 122 |
+
'instructor': gen_names()
|
| 123 |
+
},
|
| 124 |
+
'grades': {
|
| 125 |
+
'grade_id': gen_id(),
|
| 126 |
+
'student_id': [random.randint(1, 15) for _ in range(row_count)],
|
| 127 |
+
'course_id': [random.randint(1, 15) for _ in range(row_count)],
|
| 128 |
+
'score': gen_scores(),
|
| 129 |
+
'grade_date': gen_dates(180)
|
| 130 |
+
},
|
| 131 |
+
'items': {
|
| 132 |
+
'item_id': gen_id(),
|
| 133 |
+
'item_name': [f"Item {i}" for i in range(1, row_count + 1)],
|
| 134 |
+
'category': [random.choice(['Electronics', 'Furniture', 'Supplies', 'Equipment'])
|
| 135 |
+
for _ in range(row_count)],
|
| 136 |
+
'stock_level': gen_quantities(),
|
| 137 |
+
'reorder_point': [random.randint(10, 30) for _ in range(row_count)],
|
| 138 |
+
'price': gen_prices()
|
| 139 |
+
},
|
| 140 |
+
'movies': {
|
| 141 |
+
'movie_id': gen_id(),
|
| 142 |
+
'title': [f"Movie Title {i}" for i in range(1, row_count + 1)],
|
| 143 |
+
'director': gen_names(),
|
| 144 |
+
'release_year': [random.randint(2015, 2025) for _ in range(row_count)],
|
| 145 |
+
'rating': gen_ratings(),
|
| 146 |
+
'genre': [random.choice(['Action', 'Drama', 'Comedy', 'Sci-Fi', 'Thriller'])
|
| 147 |
+
for _ in range(row_count)],
|
| 148 |
+
'duration_minutes': [random.randint(90, 180) for _ in range(row_count)]
|
| 149 |
+
},
|
| 150 |
+
'patients': {
|
| 151 |
+
'patient_id': gen_id(),
|
| 152 |
+
'name': gen_names(),
|
| 153 |
+
'age': gen_ages(),
|
| 154 |
+
'email': gen_emails(gen_names()),
|
| 155 |
+
'phone': [f"+1-555-{random.randint(1000, 9999)}" for _ in range(row_count)],
|
| 156 |
+
'last_visit': gen_dates(90),
|
| 157 |
+
'condition': [random.choice(['Diabetes', 'Hypertension', 'Asthma', 'Healthy'])
|
| 158 |
+
for _ in range(row_count)]
|
| 159 |
+
},
|
| 160 |
+
'appointments': {
|
| 161 |
+
'appointment_id': gen_id(),
|
| 162 |
+
'patient_id': [random.randint(1, 15) for _ in range(row_count)],
|
| 163 |
+
'doctor_name': gen_names(),
|
| 164 |
+
'appointment_date': gen_dates(60),
|
| 165 |
+
'status': [random.choice(['Scheduled', 'Completed', 'Cancelled']) for _ in range(row_count)]
|
| 166 |
+
},
|
| 167 |
+
'properties': {
|
| 168 |
+
'property_id': gen_id(),
|
| 169 |
+
'address': [f"{random.randint(100, 9999)} Main St" for _ in range(row_count)],
|
| 170 |
+
'city': [random.choice(['Downtown', 'Suburbs', 'Uptown', 'Eastside']) for _ in range(row_count)],
|
| 171 |
+
'price': [random.randint(150000, 800000) for _ in range(row_count)],
|
| 172 |
+
'bedrooms': [random.randint(1, 5) for _ in range(row_count)],
|
| 173 |
+
'bathrooms': [random.randint(1, 3) for _ in range(row_count)],
|
| 174 |
+
'sqft': [random.randint(800, 3500) for _ in range(row_count)],
|
| 175 |
+
'status': [random.choice(['Available', 'Sold', 'Pending']) for _ in range(row_count)]
|
| 176 |
+
},
|
| 177 |
+
'events': {
|
| 178 |
+
'event_id': gen_id(),
|
| 179 |
+
'event_name': [f"Event {i}" for i in range(1, row_count + 1)],
|
| 180 |
+
'event_date': [datetime(2026, 1, random.randint(1, 31)).strftime('%Y-%m-%d')
|
| 181 |
+
for _ in range(row_count)],
|
| 182 |
+
'location': [random.choice(['Hall A', 'Conference Room', 'Auditorium', 'Stadium'])
|
| 183 |
+
for _ in range(row_count)],
|
| 184 |
+
'attendees': [random.randint(10, 200) for _ in range(row_count)],
|
| 185 |
+
'status': [random.choice(['Upcoming', 'Completed', 'Cancelled']) for _ in range(row_count)]
|
| 186 |
+
},
|
| 187 |
+
'dishes': {
|
| 188 |
+
'dish_id': gen_id(),
|
| 189 |
+
'dish_name': [f"Dish {i}" for i in range(1, row_count + 1)],
|
| 190 |
+
'category': [random.choice(['Appetizer', 'Main Course', 'Dessert', 'Beverage'])
|
| 191 |
+
for _ in range(row_count)],
|
| 192 |
+
'price': [round(random.uniform(5, 50), 2) for _ in range(row_count)],
|
| 193 |
+
'preparation_time': [random.randint(10, 60) for _ in range(row_count)]
|
| 194 |
+
},
|
| 195 |
+
'orders': {
|
| 196 |
+
'order_id': gen_id(),
|
| 197 |
+
'customer_id': [random.randint(1, 15) for _ in range(row_count)],
|
| 198 |
+
'dish_id': [random.randint(1, 15) for _ in range(row_count)],
|
| 199 |
+
'quantity': [random.randint(1, 5) for _ in range(row_count)],
|
| 200 |
+
'order_date': gen_dates(30),
|
| 201 |
+
'total_amount': gen_amounts()
|
| 202 |
+
},
|
| 203 |
+
'members': {
|
| 204 |
+
'member_id': gen_id(),
|
| 205 |
+
'name': gen_names(),
|
| 206 |
+
'email': gen_emails(gen_names()),
|
| 207 |
+
'membership_type': [random.choice(['Basic', 'Premium', 'VIP']) for _ in range(row_count)],
|
| 208 |
+
'join_date': gen_dates(730),
|
| 209 |
+
'expiry_date': [(datetime.now() + timedelta(days=random.randint(-30, 90))).strftime('%Y-%m-%d')
|
| 210 |
+
for _ in range(row_count)],
|
| 211 |
+
'status': [random.choice(['Active', 'Active', 'Active', 'Inactive']) for _ in range(row_count)]
|
| 212 |
+
},
|
| 213 |
+
'customers': {
|
| 214 |
+
'customer_id': gen_id(),
|
| 215 |
+
'name': gen_names(),
|
| 216 |
+
'email': gen_emails(gen_names()),
|
| 217 |
+
'phone': [f"+1-555-{random.randint(1000, 9999)}" for _ in range(row_count)],
|
| 218 |
+
'registration_date': gen_dates(365),
|
| 219 |
+
'status': gen_status()
|
| 220 |
+
},
|
| 221 |
+
'products': {
|
| 222 |
+
'product_id': gen_id(),
|
| 223 |
+
'product_name': [f"Product {i}" for i in range(1, row_count + 1)],
|
| 224 |
+
'category': [random.choice(['Electronics', 'Clothing', 'Home', 'Sports', 'Books'])
|
| 225 |
+
for _ in range(row_count)],
|
| 226 |
+
'price': gen_prices(),
|
| 227 |
+
'stock_quantity': gen_quantities(),
|
| 228 |
+
'supplier_id': [random.randint(1, 5) for _ in range(row_count)]
|
| 229 |
+
}
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
# Return predefined schema if exists, otherwise create generic one
|
| 233 |
+
if table_name.lower() in table_schemas:
|
| 234 |
+
return table_schemas[table_name.lower()]
|
| 235 |
+
|
| 236 |
+
# Generic fallback for unknown tables
|
| 237 |
+
# Try to infer structure from table name
|
| 238 |
+
generic_data = {
|
| 239 |
+
f'{table_name}_id': gen_id(),
|
| 240 |
+
'name': gen_names(),
|
| 241 |
+
'created_date': gen_dates(),
|
| 242 |
+
'status': gen_status(),
|
| 243 |
+
'value': gen_amounts()
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
return generic_data
|
| 247 |
|
| 248 |
def create_database_from_tables(tables_used):
    """Create an in-memory SQLite database with sample data for ALL tables mentioned.

    Parameters
    ----------
    tables_used : iterable of str
        Table names (any case, surrounding whitespace tolerated) extracted
        from the generated SQL.

    Returns
    -------
    tuple
        ``(conn, sample_data)`` where ``conn`` is the open
        ``sqlite3.Connection`` (caller is responsible for closing it) and
        ``sample_data`` maps each normalized table name to the
        ``pandas.DataFrame`` that was written into it.
    """
    conn = sqlite3.connect(':memory:')
    # NOTE: the previous version also created an unused cursor here
    # (`cursor = conn.cursor()`); pandas' to_sql manages its own cursors,
    # so it has been removed.

    sample_data = {}

    # Generate data for each table mentioned
    for table in tables_used:
        table_name = table.lower().strip()

        # Generate appropriate sample data
        table_dict = generate_generic_table_data(table_name, row_count=15)

        # Adjust row count for lookup tables
        if table_name in ['departments']:
            table_dict = {k: v[:5] if isinstance(v, list) else v for k, v in table_dict.items()}

        df = pd.DataFrame(table_dict)
        df.to_sql(table_name, conn, index=False, if_exists='replace')
        sample_data[table_name] = df

    return conn, sample_data
|
| 271 |
|
|
|
|
| 280 |
def process_nl_query(api_key, natural_query):
|
| 281 |
"""Main function to process natural language query"""
|
| 282 |
if not api_key:
|
| 283 |
+
return "β Please enter your Groq API key", "", pd.DataFrame(), ""
|
| 284 |
|
| 285 |
if not natural_query:
|
| 286 |
+
return "β Please enter a natural language query", "", pd.DataFrame(), ""
|
| 287 |
|
| 288 |
try:
|
| 289 |
# Initialize Groq client
|
|
|
|
| 294 |
output_text += "### Step 1: Understanding User Intent\n"
|
| 295 |
output_text += f"**User Query:** {natural_query}\n\n"
|
| 296 |
|
| 297 |
+
# Call Groq API for SQL generation with Kimi model
|
| 298 |
response = client.chat.completions.create(
|
| 299 |
model="moonshotai/kimi-k2-instruct-0905",
|
| 300 |
messages=[
|
|
|
|
| 302 |
"role": "system",
|
| 303 |
"content": """You are a SQL expert. Generate structured SQL queries from natural language descriptions with proper syntax validation and metadata.
|
| 304 |
|
| 305 |
+
IMPORTANT: Return your response in JSON format with the following structure:
|
| 306 |
{
|
| 307 |
"query": "SQL query string",
|
| 308 |
"query_type": "SELECT/INSERT/UPDATE/DELETE",
|
|
|
|
| 315 |
}
|
| 316 |
}
|
| 317 |
|
| 318 |
+
Use standard SQL syntax compatible with SQLite.
|
| 319 |
+
- Always use proper JOINs when multiple tables are involved
|
| 320 |
+
- Use WHERE clauses for filtering
|
| 321 |
+
- Use GROUP BY for aggregations
|
| 322 |
+
- For date comparisons, use date('now') and datetime functions
|
| 323 |
+
- Extract ALL table names mentioned or implied in the query and list them in "tables_used"
|
| 324 |
+
- If a query mentions departments and employees, include BOTH tables
|
| 325 |
+
- Be thorough in identifying all tables needed for the query""",
|
| 326 |
},
|
| 327 |
{
|
| 328 |
"role": "user",
|
|
|
|
| 364 |
|
| 365 |
# Step 3: Generate Sample Database Tables
|
| 366 |
output_text += "### Step 3: Auto-Generated Sample Database Tables\n\n"
|
| 367 |
+
output_text += f"**Tables to be created:** {', '.join(sql_query_gen.tables_used)}\n\n"
|
| 368 |
+
|
| 369 |
conn, sample_data = create_database_from_tables(sql_query_gen.tables_used)
|
| 370 |
|
| 371 |
+
# Display sample tables (show first 10 rows for readability)
|
| 372 |
for table_name, df in sample_data.items():
|
| 373 |
+
output_text += f"**π Sample `{table_name}` Table** ({len(df)} rows):\n\n"
|
| 374 |
+
output_text += df.head(10).to_markdown(index=False)
|
| 375 |
+
if len(df) > 10:
|
| 376 |
+
output_text += f"\n\n*...and {len(df) - 10} more rows*"
|
| 377 |
output_text += "\n\n"
|
| 378 |
|
| 379 |
# Step 4: Execute SQL Query
|
|
|
|
| 384 |
|
| 385 |
if error:
|
| 386 |
output_text += f"β **Execution Error:** {error}\n"
|
| 387 |
+
result_table = pd.DataFrame({"Error": [error]})
|
| 388 |
else:
|
| 389 |
output_text += "β
**Query executed successfully!**\n\n"
|
| 390 |
+
output_text += f"**π SQL Execution Result** ({len(result_df)} rows returned):\n\n"
|
| 391 |
if len(result_df) > 0:
|
| 392 |
output_text += result_df.to_markdown(index=False)
|
| 393 |
else:
|
|
|
|
| 399 |
# Format outputs for Gradio
|
| 400 |
json_output = json.dumps(sql_query_gen.model_dump(), indent=2)
|
| 401 |
|
| 402 |
+
return output_text, json_output, result_table, sql_query_gen.query
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
|
| 404 |
except Exception as e:
|
| 405 |
+
error_msg = f"β **Error:** {str(e)}\n\n**Full error details:**\n```\n{repr(e)}\n```\n\nPlease check your API key and try again."
|
| 406 |
+
return error_msg, "", pd.DataFrame({"Error": [str(e)]}), ""
|
| 407 |
|
| 408 |
# Create Gradio Interface
|
| 409 |
+
with gr.Blocks(title="Natural Language to SQL Query Executor", theme=gr.themes.Ocean()) as demo:
|
| 410 |
gr.Markdown("""
|
| 411 |
# π Natural Language to SQL Query Executor
|
| 412 |
|
|
|
|
| 414 |
|
| 415 |
**Example queries to try:**
|
| 416 |
- "Find all customers who made orders over $500 in the last 30 days, show their name, email, and total order amount"
|
| 417 |
+
- "Show all employees who earn more than $75,000 and work in the Engineering department"
|
| 418 |
+
- "List students who scored above 85% in Mathematics"
|
| 419 |
+
- "Find all books published after 2020 that are currently available"
|
| 420 |
+
- "Show properties with price between $200,000 and $500,000"
|
| 421 |
""")
|
| 422 |
|
| 423 |
with gr.Row():
|
|
|
|
| 455 |
gr.Markdown("### π Query Execution Result")
|
| 456 |
result_output = gr.Dataframe(
|
| 457 |
label="Result Table",
|
| 458 |
+
interactive=False,
|
| 459 |
+
wrap=True
|
| 460 |
)
|
| 461 |
|
| 462 |
# Connect the button to the processing function
|
|
|
|
| 469 |
gr.Markdown("""
|
| 470 |
---
|
| 471 |
### π How it works:
|
| 472 |
+
1. **Enter your Groq API key** - Required for SQL generation (using Kimi K2 Instruct model)
|
| 473 |
2. **Write your query in plain English** - Describe what data you want to find
|
| 474 |
3. **Click Generate & Execute** - The system will:
|
| 475 |
- Convert your query to SQL
|
| 476 |
+
- Automatically detect and create ALL required tables
|
| 477 |
+
- Generate realistic sample data for those tables
|
| 478 |
- Execute the query
|
| 479 |
- Show you the results
|
| 480 |
|
| 481 |
### π― Features:
|
| 482 |
+
- β
Natural language to SQL conversion using Kimi K2 Instruct
|
| 483 |
+
- β
**Smart table detection** - Creates ANY table mentioned in your query
|
| 484 |
+
- β
Automatic sample data generation for 15+ table types
|
| 485 |
- β
Query validation and metadata
|
| 486 |
- β
SQL execution on sample data
|
| 487 |
- β
Structured JSON output format
|
| 488 |
+
- β
Support for employees, books, students, movies, patients, properties, events, and more!
|
| 489 |
""")
|
| 490 |
|
| 491 |
# Launch the app
|