shukdevdattaEX commited on
Commit
1029219
Β·
verified Β·
1 Parent(s): 82a80ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +193 -452
app.py CHANGED
@@ -4,12 +4,10 @@ from pydantic import BaseModel
4
  import json
5
  import sqlite3
6
  import pandas as pd
7
- from typing import List, Optional
8
- import re
9
  from datetime import datetime, timedelta
10
  import random
11
 
12
- # Pydantic Models
13
  class ValidationStatus(BaseModel):
14
  is_valid: bool
15
  syntax_errors: list[str]
@@ -22,526 +20,269 @@ class SQLQueryGeneration(BaseModel):
22
  execution_notes: list[str]
23
  validation_status: ValidationStatus
24
 
25
- class TableSchema(BaseModel):
26
- table_name: str
27
- columns: list[dict]
28
- sample_data: list[dict]
29
-
30
- def generate_sample_data(user_query: str, groq_api_key: str) -> dict:
31
- """Generate sample table schema and data based on user query"""
32
- try:
33
- client = Groq(api_key=groq_api_key)
34
-
35
- # Get current date for context
36
- today = datetime.now().strftime('%Y-%m-%d')
37
- past_date_2y = (datetime.now() - timedelta(days=730)).strftime('%Y-%m-%d') # 2 years ago
38
- past_date_60d = (datetime.now() - timedelta(days=60)).strftime('%Y-%m-%d') # 60 days ago
39
-
40
- # Request to generate table schema and sample data
41
- schema_prompt = f"""Based on this query: "{user_query}"
42
-
43
- **Current date: {today}**
44
-
45
- Generate a realistic database schema with sample data. Return ONLY valid JSON with this structure:
46
- {{
47
- "tables": [
48
- {{
49
- "table_name": "table_name",
50
- "columns": [
51
- {{"name": "column_name", "type": "INTEGER|TEXT|REAL|DATE"}},
52
- ...
53
- ],
54
- "sample_data": [
55
- {{"column_name": value, ...}},
56
- ...at least 20-25 rows
57
- ]
58
- }}
59
- ]
60
- }}
61
-
62
- **CRITICAL INSTRUCTIONS FOR REALISTIC DATA:**
63
-
64
- 1. **DATES MUST BE IN THE PAST!**
65
- - For hire_date, created_at, registration_date: Use dates between {past_date_2y} and {today}
66
- - For order_date, transaction_date: If query mentions "last X days", use dates between {past_date_60d} and {today}
67
- - NEVER use future dates!
68
-
69
- 2. **For NUMERIC filters (salary, amount, price):**
70
- - If query says "over $80000", make 50-60% of records have values ABOVE 80000
71
- - Create realistic variation: some at 85k, some at 95k, some at 120k, etc.
72
- - Also include records BELOW the threshold (40-50%) for realism
73
-
74
- 3. **For TEXT filters (department, category, status):**
75
- - If query mentions "Engineering department", ensure 50-60% of records have department = "Engineering"
76
- - Include other departments too: "Marketing", "Sales", "HR", "Finance" for variety
77
-
78
- 4. **Data quality:**
79
- - Use realistic names, emails (first.last@company.com format)
80
- - Make data diverse and meaningful
81
- - Ensure enough records match the query criteria to get meaningful results
82
-
83
- Example: For "Find Engineering employees with salary > 80000"
84
- - Create 20+ employee records
85
- - 12-15 should be in Engineering (60%)
86
- - Of Engineering employees, 8-10 should have salary > 80000
87
- - Include other departments with various salaries for realism"""
88
 
89
- response = client.chat.completions.create(
90
- model="moonshotai/kimi-k2-instruct-0905",
91
- messages=[
92
- {"role": "system", "content": "You are a database expert. Generate realistic table schemas and sample data. ALL DATES MUST BE IN THE PAST, NEVER IN THE FUTURE. Return ONLY valid JSON, no markdown formatting."},
93
- {"role": "user", "content": schema_prompt}
94
- ],
95
- temperature=0.7
96
- )
97
-
98
- # Parse response
99
- content = response.choices[0].message.content.strip()
100
- # Remove markdown code blocks if present
101
- content = re.sub(r'```json\s*', '', content)
102
- content = re.sub(r'```\s*$', '', content)
103
-
104
- schema_data = json.loads(content)
105
-
106
- # Post-process: Enhance and fix data to ensure query results
107
- schema_data = enhance_sample_data(schema_data, user_query)
108
 
109
- return schema_data
110
- except Exception as e:
111
- raise Exception(f"Error generating sample data: {str(e)}")
 
 
 
 
112
 
113
- def enhance_sample_data(schema_data: dict, user_query: str) -> dict:
114
- """Enhance sample data to ensure queries return results and fix any date issues"""
115
-
116
- query_lower = user_query.lower()
117
-
118
- # Detect if query mentions time period (for order/transaction dates)
119
- time_keywords = {
120
- 'last 30 days': 30,
121
- 'last 60 days': 60,
122
- 'last 7 days': 7,
123
- 'last week': 7,
124
- 'last month': 30,
125
- 'last quarter': 90,
126
- 'last year': 365
127
- }
128
 
129
- days_back = None
130
- for keyword, days in time_keywords.items():
131
- if keyword in query_lower:
132
- days_back = days
133
- break
134
-
135
- # Detect amount/value thresholds
136
- threshold_amount = None
137
- amount_match = re.search(r'(?:over|above|greater than) \$?(\d+)', query_lower)
138
- if amount_match:
139
- threshold_amount = int(amount_match.group(1))
 
 
 
140
 
141
- # Detect text filters (department, category, status, etc.)
142
- text_filters = {}
143
 
144
- # Department detection
145
- dept_patterns = [
146
- r'(?:in|from) (?:the )?(\w+) department',
147
- r'department (?:is |= |== )?["\']?(\w+)["\']?',
148
- r'(\w+) department',
149
- ]
150
- for pattern in dept_patterns:
151
- dept_match = re.search(pattern, query_lower)
152
- if dept_match:
153
- text_filters['department'] = dept_match.group(1).capitalize()
154
- break
155
 
156
- # Category detection
157
- category_match = re.search(r'category (?:is |= )?["\']?(\w+)["\']?', query_lower)
158
- if category_match:
159
- text_filters['category'] = category_match.group(1).capitalize()
 
160
 
161
- # Status detection
162
- status_match = re.search(r'status (?:is |= )?["\']?(\w+)["\']?', query_lower)
163
- if status_match:
164
- text_filters['status'] = status_match.group(1).capitalize()
 
165
 
166
- for table in schema_data['tables']:
167
- enhanced_data = []
168
- original_data = table['sample_data']
169
-
170
- # Identify column types
171
- date_cols = [col['name'] for col in table['columns'] if col['type'] == 'DATE']
172
- amount_cols = [col['name'] for col in table['columns']
173
- if any(keyword in col['name'].lower() for keyword in ['amount', 'price', 'salary', 'total', 'cost', 'revenue'])]
174
-
175
- # Identify order/transaction date columns vs hire/created date columns
176
- transaction_date_cols = [col for col in date_cols
177
- if any(keyword in col.lower() for keyword in ['order', 'transaction', 'purchase', 'sale', 'payment'])]
178
- other_date_cols = [col for col in date_cols if col not in transaction_date_cols]
179
-
180
- for i, row in enumerate(original_data):
181
- new_row = row.copy()
182
-
183
- # FIX: Ensure transaction/order dates are in the past and within time period if specified
184
- if transaction_date_cols:
185
- for date_col in transaction_date_cols:
186
- if date_col in new_row:
187
- if days_back:
188
- # Within specified period
189
- random_days = random.randint(0, days_back)
190
- else:
191
- # Within last 60 days for transaction-type dates
192
- random_days = random.randint(0, 60)
193
- new_date = (datetime.now() - timedelta(days=random_days)).strftime('%Y-%m-%d')
194
- new_row[date_col] = new_date
195
-
196
- # FIX: Ensure other dates (hire_date, created_at, etc.) are in the PAST
197
- if other_date_cols:
198
- for date_col in other_date_cols:
199
- if date_col in new_row:
200
- try:
201
- # Check if date is in the future
202
- current_date = datetime.strptime(new_row[date_col], '%Y-%m-%d')
203
- if current_date > datetime.now():
204
- # Replace with a past date (random between 1 month to 3 years ago)
205
- random_days = random.randint(30, 1095)
206
- new_date = (datetime.now() - timedelta(days=random_days)).strftime('%Y-%m-%d')
207
- new_row[date_col] = new_date
208
- except:
209
- # If date parsing fails, generate a new past date
210
- random_days = random.randint(30, 1095)
211
- new_date = (datetime.now() - timedelta(days=random_days)).strftime('%Y-%m-%d')
212
- new_row[date_col] = new_date
213
-
214
- # Enhance amount fields to match threshold
215
- if threshold_amount and amount_cols:
216
- for amount_col in amount_cols:
217
- if amount_col in new_row:
218
- # 55% of records above threshold, 45% below
219
- if i % 100 < 55: # More deterministic distribution
220
- # Above threshold
221
- new_row[amount_col] = int(random.uniform(threshold_amount * 1.05, threshold_amount * 2.5))
222
- else:
223
- # Below threshold
224
- new_row[amount_col] = int(random.uniform(threshold_amount * 0.4, threshold_amount * 0.95))
225
-
226
- # Apply text filters to ensure enough matching records
227
- for col_name, target_value in text_filters.items():
228
- if col_name in new_row:
229
- # 55% should match the filter value
230
- if i % 100 < 55:
231
- new_row[col_name] = target_value
232
- else:
233
- # Use other values for variety
234
- if col_name == 'department':
235
- other_depts = ['Marketing', 'Sales', 'HR', 'Finance', 'Operations', 'IT']
236
- new_row[col_name] = random.choice([d for d in other_depts if d != target_value])
237
- elif col_name == 'status':
238
- other_statuses = ['Active', 'Inactive', 'Pending', 'Completed', 'Cancelled']
239
- new_row[col_name] = random.choice([s for s in other_statuses if s != target_value])
240
-
241
- enhanced_data.append(new_row)
242
-
243
- # Ensure we have at least 20 rows
244
- while len(enhanced_data) < 20:
245
- template_idx = len(enhanced_data) % len(original_data)
246
- template_row = enhanced_data[template_idx].copy()
247
-
248
- # Modify IDs to be unique
249
- for col in table['columns']:
250
- if 'id' in col['name'].lower() and col['type'] == 'INTEGER':
251
- template_row[col['name']] = len(enhanced_data) + 1
252
-
253
- enhanced_data.append(template_row)
254
-
255
- table['sample_data'] = enhanced_data
256
-
257
- return schema_data
258
 
259
- def create_tables_in_db(schema_data: dict) -> sqlite3.Connection:
260
- """Create SQLite tables and populate with sample data"""
261
- conn = sqlite3.connect(':memory:')
262
- cursor = conn.cursor()
 
 
 
 
 
 
 
 
263
 
264
- for table in schema_data['tables']:
265
- table_name = table['table_name']
266
- columns = table['columns']
267
-
268
- # Create table
269
- column_defs = []
270
- for col in columns:
271
- col_type = col['type'].upper()
272
- column_defs.append(f"{col['name']} {col_type}")
273
-
274
- create_table_sql = f"CREATE TABLE {table_name} ({', '.join(column_defs)})"
275
- cursor.execute(create_table_sql)
276
-
277
- # Insert sample data
278
- sample_data = table['sample_data']
279
- if sample_data:
280
- col_names = [col['name'] for col in columns]
281
- placeholders = ', '.join(['?' for _ in col_names])
282
- insert_sql = f"INSERT INTO {table_name} ({', '.join(col_names)}) VALUES ({placeholders})"
283
-
284
- for row in sample_data:
285
- values = [row.get(col) for col in col_names]
286
- cursor.execute(insert_sql, values)
287
 
288
- conn.commit()
289
- return conn
290
-
291
- def generate_sql_query(user_query: str, groq_api_key: str, schema_info: str) -> SQLQueryGeneration:
292
- """Generate SQL query using Groq API with schema context"""
293
  try:
294
- client = Groq(api_key=groq_api_key)
 
295
 
296
- enhanced_query = f"""Database Schema:
297
- {schema_info}
298
-
299
- User Request: {user_query}
300
-
301
- Generate a SQL query that works with the above schema. Use SQLite-compatible syntax."""
302
-
303
  response = client.chat.completions.create(
304
- model="moonshotai/kimi-k2-instruct-0905",
305
  messages=[
306
  {
307
  "role": "system",
308
- "content": "You are a SQL expert. Generate structured SQL queries from natural language descriptions with proper syntax validation and metadata. Use standard SQL syntax compatible with SQLite. For date operations, use SQLite functions like date('now') and datetime().",
309
  },
310
- {"role": "user", "content": enhanced_query},
311
  ],
312
  response_format={
313
- "type": "json_schema",
314
- "json_schema": {
315
- "name": "sql_query_generation",
316
- "schema": SQLQueryGeneration.model_json_schema()
317
- }
318
  }
319
  )
320
 
321
- sql_query_generation = SQLQueryGeneration.model_validate(
322
- json.loads(response.choices[0].message.content)
323
- )
324
- return sql_query_generation
325
- except Exception as e:
326
- raise Exception(f"Error generating SQL query: {str(e)}")
327
-
328
- def execute_sql_query(conn: sqlite3.Connection, query: str) -> pd.DataFrame:
329
- """Execute SQL query and return results as DataFrame"""
330
- try:
331
- df = pd.read_sql_query(query, conn)
332
- return df
333
- except Exception as e:
334
- raise Exception(f"Error executing SQL query: {str(e)}")
335
-
336
- def format_schema_info(schema_data: dict) -> str:
337
- """Format schema information for display"""
338
- info = []
339
- for table in schema_data['tables']:
340
- info.append(f"\nTable: {table['table_name']}")
341
- info.append("Columns:")
342
- for col in table['columns']:
343
- info.append(f" - {col['name']} ({col['type']})")
344
- info.append(f"Sample rows: {len(table['sample_data'])}")
345
- return '\n'.join(info)
346
-
347
- def process_query(user_query: str, groq_api_key: str):
348
- """Main processing function"""
349
- if not groq_api_key or not groq_api_key.strip():
350
- return "❌ Please enter your Groq API key", None, "", "", ""
351
-
352
- if not user_query or not user_query.strip():
353
- return "❌ Please enter a query", None, "", "", ""
354
-
355
- try:
356
- output_log = []
357
-
358
- # Step 1: Generate sample data
359
- output_log.append("### Step 1: Generating Sample Database Schema and Data")
360
- output_log.append(f"Query: {user_query}\n")
361
-
362
- schema_data = generate_sample_data(user_query, groq_api_key)
363
- schema_info = format_schema_info(schema_data)
364
-
365
- output_log.append("βœ… Generated database schema:")
366
- output_log.append(schema_info)
367
- output_log.append("")
368
 
369
- # Step 2: Create tables
370
- output_log.append("### Step 2: Creating In-Memory SQLite Database")
371
- conn = create_tables_in_db(schema_data)
372
- output_log.append("βœ… Tables created and populated with sample data\n")
373
-
374
- # Display sample data
375
- sample_tables_html = []
376
- for table in schema_data['tables']:
377
- df_sample = pd.DataFrame(table['sample_data'][:10]) # Show first 10 rows
378
- sample_tables_html.append(f"<h4>Sample Data from '{table['table_name']}' (first 10 rows):</h4>")
379
- sample_tables_html.append(df_sample.to_html(index=False, border=1, classes='table table-striped'))
 
 
 
 
 
380
 
381
- # Step 3: Generate SQL query
382
- output_log.append("### Step 3: Generating SQL Query")
383
- sql_generation = generate_sql_query(user_query, groq_api_key, schema_info)
 
 
384
 
385
- # Format the SQL generation output
386
- sql_output = {
387
- "query": sql_generation.query,
388
- "query_type": sql_generation.query_type,
389
- "tables_used": sql_generation.tables_used,
390
- "estimated_complexity": sql_generation.estimated_complexity,
391
- "execution_notes": sql_generation.execution_notes,
392
- "validation_status": {
393
- "is_valid": sql_generation.validation_status.is_valid,
394
- "syntax_errors": sql_generation.validation_status.syntax_errors
395
- }
396
- }
397
 
398
- sql_output_formatted = sql_output
399
- output_log.append("βœ… SQL Query Generated:\n")
 
 
 
400
 
401
- # Step 4: Execute query
402
- output_log.append("\n### Step 4: Executing SQL Query")
403
- output_log.append(f"Executing: {sql_generation.query}\n")
404
 
405
- result_df = execute_sql_query(conn, sql_generation.query)
406
 
407
- if len(result_df) == 0:
408
- output_log.append("⚠️ Query executed successfully but returned 0 rows")
409
- output_log.append("This might happen if the sample data doesn't match the query criteria.")
410
- result_html = "<p><i>No results found. The query executed successfully but no data matched the criteria.</i></p>"
411
  else:
412
- output_log.append(f"βœ… Query executed successfully! Returned {len(result_df)} row(s)\n")
413
- result_html = f"<h4>Query Results ({len(result_df)} rows):</h4>"
414
- result_html += result_df.to_html(index=False, border=1, classes='table table-striped')
 
415
 
416
  conn.close()
417
 
418
- # Combine all outputs
419
- process_log = '\n'.join(output_log)
420
- sample_data_html = '\n'.join(sample_tables_html)
 
 
 
 
421
 
422
- return process_log, sql_output_formatted, sample_data_html, result_html, ""
423
 
424
  except Exception as e:
425
- error_msg = f"❌ Error: {str(e)}"
426
- return error_msg, None, "", "", ""
427
 
428
- # Custom CSS for better table styling
429
- custom_css = """
430
- .table {
431
- width: 100%;
432
- border-collapse: collapse;
433
- margin: 10px 0;
434
- font-size: 14px;
435
- }
436
- .table th {
437
- background-color: #4a5568;
438
- color: white;
439
- font-weight: bold;
440
- padding: 10px;
441
- text-align: left;
442
- border: 1px solid #2d3748;
443
- }
444
- .table td {
445
- padding: 8px 10px;
446
- border: 1px solid #e2e8f0;
447
- }
448
- .table-striped tbody tr:nth-child(odd) {
449
- background-color: #f7fafc;
450
- }
451
- .table-striped tbody tr:nth-child(even) {
452
- background-color: #ffffff;
453
- }
454
- .table-striped tbody tr:hover {
455
- background-color: #edf2f7;
456
- }
457
- """
458
-
459
- # Gradio Interface
460
- with gr.Blocks(title="SQLGenie - AI SQL Query Generator", theme=gr.themes.Ocean(), css=custom_css) as app:
461
  gr.Markdown("""
462
- # ⚑ SQLGenie - AI SQL Query Generator & Executor
463
 
464
- Transform natural language into SQL queries and see instant results! This app:
465
- 1. 🎲 Generates realistic sample database tables based on your query
466
- 2. πŸ§™ Creates a structured SQL query from natural language using AI
467
- 3. βš™οΈ Executes the query on sample data
468
- 4. πŸ“Š Shows you the results instantly
469
 
470
- ### How to use:
471
- 1. Enter your Groq API key ([Get one free here](https://console.groq.com/keys))
472
- 2. Describe what data you want in plain English
473
- 3. Click "Generate & Execute SQL" and watch the magic happen! ✨
474
  """)
475
 
476
  with gr.Row():
477
- with gr.Column(scale=2):
478
  api_key_input = gr.Textbox(
479
  label="πŸ”‘ Groq API Key",
 
480
  placeholder="Enter your Groq API key here...",
481
- type="password"
482
  )
483
 
484
  query_input = gr.Textbox(
485
  label="πŸ’¬ Natural Language Query",
486
- placeholder="Example: Find all customers who made orders over $500 in the last 30 days, show their name, email, and total order amount",
487
  lines=3
488
  )
489
 
490
  submit_btn = gr.Button("πŸš€ Generate & Execute SQL", variant="primary", size="lg")
 
 
 
491
 
492
  with gr.Row():
493
  with gr.Column():
494
- gr.Markdown("### πŸ“‹ Process Log")
495
- process_output = gr.Textbox(
496
- label="Execution Steps",
497
- lines=12,
498
- max_lines=20
499
- )
500
-
501
- with gr.Row():
502
- with gr.Column():
503
- gr.Markdown("### πŸ—‚οΈ Sample Database Tables")
504
- sample_data_output = gr.HTML(label="Sample Data")
505
 
506
  with gr.Row():
507
  with gr.Column():
508
- gr.Markdown("### πŸ“ Generated SQL Query (Structured Output)")
509
- sql_output = gr.JSON(label="SQL Query Metadata")
510
 
511
  with gr.Row():
512
  with gr.Column():
513
- gr.Markdown("### ✨ Query Execution Results")
514
- result_output = gr.HTML(label="Results")
515
-
516
- # Examples
517
- gr.Examples(
518
- examples=[
519
- ["Find all customers who made orders over $500 in the last 30 days, show their name, email, and total order amount"],
520
- ["List all products that are out of stock along with their supplier information"],
521
- ["Show the top 5 employees by total sales in the last quarter"],
522
- ["Find all students who scored above 85% in Mathematics and their contact details"],
523
- ["Get all active users who haven't logged in for more than 60 days"],
524
- ["Show all transactions above $1000 in the last week with customer details"],
525
- ["Find employees in the Engineering department with salary over $80000"]
526
- ],
527
- inputs=query_input,
528
- label="πŸ’‘ Example Queries - Click to try!"
529
- )
530
 
 
531
  submit_btn.click(
532
- fn=process_query,
533
- inputs=[query_input, api_key_input],
534
- outputs=[process_output, sql_output, sample_data_output, result_output, gr.Textbox(visible=False)]
535
  )
536
 
537
  gr.Markdown("""
538
  ---
539
- ### 🎯 Tips for Best Results:
540
- - Be specific about time periods (e.g., "last 30 days", "last quarter")
541
- - Mention thresholds clearly (e.g., "over $500", "above 85%")
542
- - Specify what fields you want to see (e.g., "show name, email, total")
543
- - The app generates realistic sample data automatically to match your query!
 
 
 
 
 
 
 
 
 
 
544
  """)
545
 
 
546
  if __name__ == "__main__":
547
- app.launch()
 
4
  import json
5
  import sqlite3
6
  import pandas as pd
 
 
7
  from datetime import datetime, timedelta
8
  import random
9
 
10
# Pydantic models for structured output
class ValidationStatus(BaseModel):
    """Syntax-validation metadata attached to a generated SQL query."""
    # is_valid: whether the generated SQL passed the model's syntax check
    # syntax_errors: human-readable error descriptions (empty when valid)
    is_valid: bool
    syntax_errors: list[str]
 
20
  execution_notes: list[str]
21
  validation_status: ValidationStatus
22
 
23
+ # Sample data generators
24
def generate_sample_customers(count=10):
    """Create *count* synthetic customer records.

    Each record is a dict with keys ``customer_id`` (sequential, 1-based),
    ``name`` (random first/last name pair) and ``email`` (first name plus id
    at example.com, so every address is unique).
    """
    given_names = ["Alice", "Bob", "Carol", "David", "Emma", "Frank", "Grace", "Henry", "Ivy", "Jack"]
    family_names = ["Johnson", "Smith", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez"]

    def make_customer(idx):
        # Pick the name parts once so the email agrees with the display name.
        first = random.choice(given_names)
        last = random.choice(family_names)
        return {
            'customer_id': idx,
            'name': f"{first} {last}",
            'email': f"{first.lower()}{idx}@example.com"
        }

    return [make_customer(i) for i in range(1, count + 1)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
def generate_sample_orders(customer_count=10, order_count=20):
    """Create *order_count* synthetic orders.

    Each order is a dict with ``order_id`` (101, 102, ...), a random
    ``customer_id`` in [1, customer_count], a ``total_amount`` drawn from a
    fixed list of realistic values, and an ``order_date`` (ISO string) that
    falls within the last 60 days.
    """
    now = datetime.now()
    amount_choices = [250, 350, 450, 600, 800, 1200, 1500, 300]

    def make_order(seq):
        # Orders are dated up to 60 days in the past, never in the future.
        age_days = random.randint(0, 60)
        buyer_id = random.randint(1, customer_count)
        amount = random.choice(amount_choices)
        return {
            'order_id': 100 + seq,
            'customer_id': buyer_id,
            'total_amount': amount,
            'order_date': (now - timedelta(days=age_days)).strftime('%Y-%m-%d')
        }

    return [make_order(i) for i in range(1, order_count + 1)]
56
 
57
def generate_sample_products(count=15):
    """Create *count* synthetic product records.

    Each record is a dict with ``product_id`` (sequential, 1-based),
    ``product_name`` (random base name suffixed with the id), a random
    ``category``, a ``price`` in [10, 500] rounded to cents, and a
    ``stock_quantity`` in [0, 100] (0 means out of stock).
    """
    category_choices = ["Electronics", "Clothing", "Home", "Sports", "Books"]
    base_names = ["Widget", "Gadget", "Tool", "Item", "Device"]

    def make_product(pid):
        return {
            'product_id': pid,
            'product_name': f"{random.choice(base_names)} {pid}",
            'category': random.choice(category_choices),
            'price': round(random.uniform(10, 500), 2),
            'stock_quantity': random.randint(0, 100)
        }

    return [make_product(i) for i in range(1, count + 1)]
72
+
73
def create_database_from_tables(tables_used):
    """Build an in-memory SQLite database holding sample tables.

    Parameters
    ----------
    tables_used : iterable of str
        Table names referenced by the generated SQL.  Matching is
        case-insensitive, since the names come from LLM output and may be
        capitalized ("Customers", "ORDERS", ...).

    Returns
    -------
    tuple
        ``(conn, sample_data)`` — the open :class:`sqlite3.Connection`
        (caller is responsible for closing it) and a dict mapping each
        created table name to the DataFrame loaded into it.  Only the
        known tables ('customers', 'orders', 'products') are created.
    """
    conn = sqlite3.connect(':memory:')
    sample_data = {}

    # Normalize names so matching is case-insensitive (LLM output varies).
    wanted = {str(name).lower() for name in tables_used}

    # Known table name -> factory producing its sample rows.
    generators = {
        'customers': lambda: generate_sample_customers(10),
        'orders': lambda: generate_sample_orders(10, 20),
        'products': lambda: generate_sample_products(15),
    }

    for table_name, make_rows in generators.items():
        if table_name in wanted:
            df = pd.DataFrame(make_rows())
            df.to_sql(table_name, conn, index=False, if_exists='replace')
            sample_data[table_name] = df

    return conn, sample_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
def execute_sql_on_sample_data(sql_query, conn):
    """Run *sql_query* against the sample database.

    Returns ``(dataframe, None)`` on success, or ``(None, error_message)``
    when execution fails — callers branch on the second element.
    """
    try:
        frame = pd.read_sql_query(sql_query, conn)
    except Exception as exc:
        return None, str(exc)
    return frame, None
108
+
109
def process_nl_query(api_key, natural_query):
    """Convert a natural-language request into SQL and run it on sample data.

    Parameters
    ----------
    api_key : str
        Groq API key used for the chat-completion call.
    natural_query : str
        Plain-English description of the data the user wants.

    Returns
    -------
    tuple
        ``(process_markdown, json_metadata_str, result_dataframe, sql_str)``
        matching the Gradio outputs. On validation or API failure the first
        element carries the error text and the remaining slots hold empty
        placeholders (an empty DataFrame for the gr.Dataframe slot).
    """
    # Guard clauses: reject missing or whitespace-only input up front.
    # Use pd.DataFrame() (not "") for the gr.Dataframe output slot, matching
    # the except-branch at the bottom.
    if not api_key or not api_key.strip():
        return "❌ Please enter your Groq API key", "", pd.DataFrame(), ""

    if not natural_query or not natural_query.strip():
        return "❌ Please enter a natural language query", "", pd.DataFrame(), ""

    try:
        # Initialize Groq client
        client = Groq(api_key=api_key)

        # Step 1: Generate SQL from natural language
        output_text = "## 📋 STEP-BY-STEP PROCESS\n\n"
        output_text += "### Step 1: Understanding User Intent\n"
        output_text += f"**User Query:** {natural_query}\n\n"

        # Call Groq API for SQL generation.
        # JSON mode (response_format json_object) requires the prompt to
        # explicitly instruct the model to emit JSON, otherwise the API
        # rejects the request — hence the key list in the system message.
        # NOTE(review): confirm this model id is still served by Groq.
        response = client.chat.completions.create(
            model="mixtral-8x7b-32768",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a SQL expert. Generate structured SQL queries from "
                        "natural language descriptions with proper syntax validation "
                        "and metadata. Use standard SQL syntax compatible with SQLite. "
                        "Respond ONLY with a JSON object containing the keys: query, "
                        "query_type, tables_used, estimated_complexity, execution_notes, "
                        "and validation_status (an object with is_valid and syntax_errors)."
                    ),
                },
                {"role": "user", "content": natural_query},
            ],
            response_format={
                "type": "json_object"
            }
        )

        # Parse the model's JSON payload.
        response_content = response.choices[0].message.content
        sql_data = json.loads(response_content)

        # Try to map the payload onto our Pydantic model; fall back to a
        # field-by-field build with defaults when the shape deviates.
        try:
            sql_query_gen = SQLQueryGeneration(**sql_data)
        except Exception:  # narrowed from bare `except:` — don't swallow SystemExit/KeyboardInterrupt
            sql_query_gen = SQLQueryGeneration(
                query=sql_data.get('query', ''),
                query_type=sql_data.get('query_type', 'SELECT'),
                tables_used=sql_data.get('tables_used', []),
                estimated_complexity=sql_data.get('estimated_complexity', 'medium'),
                execution_notes=sql_data.get('execution_notes', []),
                validation_status=ValidationStatus(
                    is_valid=sql_data.get('validation_status', {}).get('is_valid', True),
                    syntax_errors=sql_data.get('validation_status', {}).get('syntax_errors', [])
                )
            )

        # Step 2: Display structured SQL output
        output_text += "### Step 2: Generated Structured SQL\n\n"
        output_text += "```json\n"
        output_text += json.dumps(sql_query_gen.model_dump(), indent=2)
        output_text += "\n```\n\n"

        # Step 3: Build an in-memory sample database for the referenced tables
        output_text += "### Step 3: Auto-Generated Sample Database Tables\n\n"
        conn, sample_data = create_database_from_tables(sql_query_gen.tables_used)

        # Render each generated sample table as markdown
        for table_name, df in sample_data.items():
            output_text += f"**📊 Sample `{table_name}` Table:**\n\n"
            output_text += df.to_markdown(index=False)
            output_text += "\n\n"

        # Step 4: Execute the generated SQL on the sample tables
        output_text += "### Step 4: Execute Generated SQL on Sample Tables\n\n"
        output_text += f"**SQL Query:**\n```sql\n{sql_query_gen.query}\n```\n\n"

        result_df, error = execute_sql_on_sample_data(sql_query_gen.query, conn)

        if error:
            output_text += f"❌ **Execution Error:** {error}\n"
        else:
            output_text += "✅ **Query executed successfully!**\n\n"
            output_text += "**📈 SQL Execution Result:**\n\n"
            output_text += result_df.to_markdown(index=False)

        conn.close()

        # Format outputs for Gradio
        json_output = json.dumps(sql_query_gen.model_dump(), indent=2)

        if result_df is not None:
            result_display = result_df
        else:
            # Surface the SQL error in the result table instead of leaving it blank.
            result_display = pd.DataFrame({"Error": [error]})

        return output_text, json_output, result_display, sql_query_gen.query

    except Exception as e:
        error_msg = f"❌ **Error:** {str(e)}\n\nPlease check your API key and query."
        return error_msg, "", pd.DataFrame(), ""
208
 
209
# Create Gradio Interface
# Layout: input column (API key, query, button, generated SQL), then three
# full-width rows (process log, JSON metadata, result table). Widget creation
# order determines on-screen order, so statements must not be reordered.
with gr.Blocks(title="Natural Language to SQL Query Executor", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔍 Natural Language to SQL Query Executor

    Convert natural language queries into SQL, generate sample data, and execute queries automatically!

    **Example queries to try:**
    - "Find all customers who made orders over $500 in the last 30 days, show their name, email, and total order amount"
    - "Show all products with stock quantity less than 10"
    - "List top 5 customers by total order amount"
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Credentials are masked; the key is only held in the session.
            api_key_input = gr.Textbox(
                label="🔑 Groq API Key",
                type="password",
                placeholder="Enter your Groq API key here...",
                info="Get your API key from https://console.groq.com"
            )

            query_input = gr.Textbox(
                label="💬 Natural Language Query",
                placeholder="e.g., Find all customers who made orders over $500 in the last 30 days...",
                lines=3
            )

            submit_btn = gr.Button("🚀 Generate & Execute SQL", variant="primary", size="lg")

            # Read-only code widget populated by process_nl_query's 4th output.
            gr.Markdown("### 📝 Generated SQL Query")
            sql_output = gr.Code(label="SQL Query", language="sql")

    with gr.Row():
        with gr.Column():
            # Markdown log of the four processing steps.
            gr.Markdown("### 📊 Process & Results")
            process_output = gr.Markdown()

    with gr.Row():
        with gr.Column():
            # Pretty-printed structured metadata from the model.
            gr.Markdown("### 🎯 Structured JSON Output")
            json_output = gr.Code(label="JSON Response", language="json")

    with gr.Row():
        with gr.Column():
            # Final query result (or a one-column Error table on failure).
            gr.Markdown("### 📈 Query Execution Result")
            result_output = gr.Dataframe(
                label="Result Table",
                interactive=False
            )

    # Connect the button to the processing function
    # Output order must match process_nl_query's return tuple:
    # (process markdown, JSON string, result DataFrame, SQL string).
    submit_btn.click(
        fn=process_nl_query,
        inputs=[api_key_input, query_input],
        outputs=[process_output, json_output, result_output, sql_output]
    )

    gr.Markdown("""
    ---
    ### 📖 How it works:
    1. **Enter your Groq API key** - Required for SQL generation
    2. **Write your query in plain English** - Describe what data you want to find
    3. **Click Generate & Execute** - The system will:
    - Convert your query to SQL
    - Generate sample database tables
    - Execute the query
    - Show you the results

    ### 🎯 Features:
    - ✅ Natural language to SQL conversion
    - ✅ Automatic sample data generation
    - ✅ Query validation and metadata
    - ✅ SQL execution on sample data
    - ✅ Structured JSON output format
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch()