Spaces:

shukdevdattaEX
/

SQLGenie

Paused

App Files Community

shukdevdattaEX committed 21 days ago

Commit

9910f72

verified ·

1 Parent(s): 82fb5aa

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -41

app.py CHANGED Viewed

@@ -34,11 +34,13 @@ def generate_sample_data(user_query: str, groq_api_key: str) -> dict:
         # Get current date for context
         today = datetime.now().strftime('%Y-%m-%d')
         # Request to generate table schema and sample data
         schema_prompt = f"""Based on this query: "{user_query}"
-Current date: {today}
 Generate a realistic database schema with sample data. Return ONLY valid JSON with this structure:
 {{
@@ -51,26 +53,43 @@ Generate a realistic database schema with sample data. Return ONLY valid JSON wi
       ],
       "sample_data": [
         {{"column_name": value, ...}},
-        ...at least 15-20 rows
       ]
     }}
   ]
 }}
-IMPORTANT INSTRUCTIONS FOR REALISTIC DATA:
-1. For DATE columns: Use dates in format 'YYYY-MM-DD'. Include dates from the last 60 days to ensure some fall within "last 30 days"
-2. For queries mentioning "last X days": Generate at least 50% of dates within that timeframe
-3. For queries with amount/price filters (e.g., "over $500"): Ensure at least 40% of records meet the criteria
-4. For queries with thresholds: Create data both above AND below the threshold
-5. Use realistic names, emails, and values
-6. Make sure there's enough variety in the data to produce meaningful query results
-Example date range: from {(datetime.now() - timedelta(days=60)).strftime('%Y-%m-%d')} to {today}"""
         response = client.chat.completions.create(
             model="moonshotai/kimi-k2-instruct-0905",
             messages=[
-                {"role": "system", "content": "You are a database expert. Generate realistic table schemas and sample data that will produce meaningful query results. Return ONLY valid JSON, no markdown formatting."},
                 {"role": "user", "content": schema_prompt}
             ],
             temperature=0.7
@@ -84,7 +103,7 @@ Example date range: from {(datetime.now() - timedelta(days=60)).strftime('%Y-%m-
         schema_data = json.loads(content)
-        # Post-process: Enhance data to ensure query results
         schema_data = enhance_sample_data(schema_data, user_query)
         return schema_data
@@ -92,9 +111,11 @@ Example date range: from {(datetime.now() - timedelta(days=60)).strftime('%Y-%m-
         raise Exception(f"Error generating sample data: {str(e)}")
 def enhance_sample_data(schema_data: dict, user_query: str) -> dict:
-    """Enhance sample data to ensure queries return results"""
-    # Detect if query mentions time period
     time_keywords = {
         'last 30 days': 30,
         'last 60 days': 60,
@@ -107,51 +128,122 @@ def enhance_sample_data(schema_data: dict, user_query: str) -> dict:
     days_back = None
     for keyword, days in time_keywords.items():
-        if keyword in user_query.lower():
             days_back = days
             break
     # Detect amount/value thresholds
-    amount_pattern = r'\$?(\d+)'
-    amount_match = re.search(r'over \$?(\d+)', user_query.lower())
-    threshold_amount = int(amount_match.group(1)) if amount_match else None
     for table in schema_data['tables']:
         enhanced_data = []
         original_data = table['sample_data']
-        # Find date and amount columns
         date_cols = [col['name'] for col in table['columns'] if col['type'] == 'DATE']
-        amount_cols = [col['name'] for col in table['columns'] if 'amount' in col['name'].lower() or 'price' in col['name'].lower() or 'total' in col['name'].lower()]
-        for row in original_data:
             new_row = row.copy()
-            # Enhance date fields to be within the time period
-            if days_back and date_cols:
-                for date_col in date_cols:
                     if date_col in new_row:
-                        # Generate random date within the period
-                        random_days = random.randint(0, days_back)
                         new_date = (datetime.now() - timedelta(days=random_days)).strftime('%Y-%m-%d')
                         new_row[date_col] = new_date
-            # Enhance amount fields to exceed threshold
             if threshold_amount and amount_cols:
                 for amount_col in amount_cols:
                     if amount_col in new_row:
-                        # 60% of records above threshold, 40% below
-                        if random.random() < 0.6:
-                            new_row[amount_col] = round(random.uniform(threshold_amount * 1.1, threshold_amount * 3), 2)
                         else:
-                            new_row[amount_col] = round(random.uniform(threshold_amount * 0.3, threshold_amount * 0.9), 2)
             enhanced_data.append(new_row)
-        # Add more rows if needed (ensure at least 15 rows)
-        while len(enhanced_data) < 15:
-            # Duplicate and modify existing rows
-            template_row = enhanced_data[len(enhanced_data) % len(original_data)].copy()
             # Modify IDs to be unique
             for col in table['columns']:
@@ -339,20 +431,28 @@ custom_css = """
     width: 100%;
     border-collapse: collapse;
     margin: 10px 0;
 }
 .table th {
-    background-color: #f0f0f0;
     font-weight: bold;
-    padding: 8px;
     text-align: left;
-    border: 1px solid #ddd;
 }
 .table td {
-    padding: 8px;
-    border: 1px solid #ddd;
 }
 .table-striped tbody tr:nth-child(odd) {
-    background-color: #f9f9f9;
 }
 """

         # Get current date for context
         today = datetime.now().strftime('%Y-%m-%d')
+        past_date_2y = (datetime.now() - timedelta(days=730)).strftime('%Y-%m-%d')  # 2 years ago
+        past_date_60d = (datetime.now() - timedelta(days=60)).strftime('%Y-%m-%d')  # 60 days ago
         # Request to generate table schema and sample data
         schema_prompt = f"""Based on this query: "{user_query}"
+**Current date: {today}**
 Generate a realistic database schema with sample data. Return ONLY valid JSON with this structure:
 {{
       ],
       "sample_data": [
         {{"column_name": value, ...}},
+        ...at least 20-25 rows
       ]
     }}
   ]
 }}
+**CRITICAL INSTRUCTIONS FOR REALISTIC DATA:**
+1. **DATES MUST BE IN THE PAST!**
+   - For hire_date, created_at, registration_date: Use dates between {past_date_2y} and {today}
+   - For order_date, transaction_date: If query mentions "last X days", use dates between {past_date_60d} and {today}
+   - NEVER use future dates!
+2. **For NUMERIC filters (salary, amount, price):**
+   - If query says "over $80000", make 50-60% of records have values ABOVE 80000
+   - Create realistic variation: some at 85k, some at 95k, some at 120k, etc.
+   - Also include records BELOW the threshold (40-50%) for realism
+3. **For TEXT filters (department, category, status):**
+   - If query mentions "Engineering department", ensure 50-60% of records have department = "Engineering"
+   - Include other departments too: "Marketing", "Sales", "HR", "Finance" for variety
+4. **Data quality:**
+   - Use realistic names, emails (first.last@company.com format)
+   - Make data diverse and meaningful
+   - Ensure enough records match the query criteria to get meaningful results
+Example: For "Find Engineering employees with salary > 80000"
+- Create 20+ employee records
+- 12-15 should be in Engineering (60%)
+- Of Engineering employees, 8-10 should have salary > 80000
+- Include other departments with various salaries for realism"""
         response = client.chat.completions.create(
             model="moonshotai/kimi-k2-instruct-0905",
             messages=[
+                {"role": "system", "content": "You are a database expert. Generate realistic table schemas and sample data. ALL DATES MUST BE IN THE PAST, NEVER IN THE FUTURE. Return ONLY valid JSON, no markdown formatting."},
                 {"role": "user", "content": schema_prompt}
             ],
             temperature=0.7
         schema_data = json.loads(content)
+        # Post-process: Enhance and fix data to ensure query results
         schema_data = enhance_sample_data(schema_data, user_query)
         return schema_data
         raise Exception(f"Error generating sample data: {str(e)}")
 def enhance_sample_data(schema_data: dict, user_query: str) -> dict:
+    """Enhance sample data to ensure queries return results and fix any date issues"""
+    query_lower = user_query.lower()
+    # Detect if query mentions time period (for order/transaction dates)
     time_keywords = {
         'last 30 days': 30,
         'last 60 days': 60,
     days_back = None
     for keyword, days in time_keywords.items():
+        if keyword in query_lower:
             days_back = days
             break
     # Detect amount/value thresholds
+    threshold_amount = None
+    amount_match = re.search(r'(?:over|above|greater than) \$?(\d+)', query_lower)
+    if amount_match:
+        threshold_amount = int(amount_match.group(1))
+    # Detect text filters (department, category, status, etc.)
+    text_filters = {}
+    # Department detection
+    dept_patterns = [
+        r'(?:in|from) (?:the )?(\w+) department',
+        r'department (?:is |= |== )?["\']?(\w+)["\']?',
+        r'(\w+) department',
+    ]
+    for pattern in dept_patterns:
+        dept_match = re.search(pattern, query_lower)
+        if dept_match:
+            text_filters['department'] = dept_match.group(1).capitalize()
+            break
+    # Category detection
+    category_match = re.search(r'category (?:is |= )?["\']?(\w+)["\']?', query_lower)
+    if category_match:
+        text_filters['category'] = category_match.group(1).capitalize()
+    # Status detection
+    status_match = re.search(r'status (?:is |= )?["\']?(\w+)["\']?', query_lower)
+    if status_match:
+        text_filters['status'] = status_match.group(1).capitalize()
     for table in schema_data['tables']:
         enhanced_data = []
         original_data = table['sample_data']
+        # Identify column types
         date_cols = [col['name'] for col in table['columns'] if col['type'] == 'DATE']
+        amount_cols = [col['name'] for col in table['columns']
+                      if any(keyword in col['name'].lower() for keyword in ['amount', 'price', 'salary', 'total', 'cost', 'revenue'])]
+        # Identify order/transaction date columns vs hire/created date columns
+        transaction_date_cols = [col for col in date_cols
+                                if any(keyword in col.lower() for keyword in ['order', 'transaction', 'purchase', 'sale', 'payment'])]
+        other_date_cols = [col for col in date_cols if col not in transaction_date_cols]
+        for i, row in enumerate(original_data):
             new_row = row.copy()
+            # FIX: Ensure transaction/order dates are in the past and within time period if specified
+            if transaction_date_cols:
+                for date_col in transaction_date_cols:
                     if date_col in new_row:
+                        if days_back:
+                            # Within specified period
+                            random_days = random.randint(0, days_back)
+                        else:
+                            # Within last 60 days for transaction-type dates
+                            random_days = random.randint(0, 60)
                         new_date = (datetime.now() - timedelta(days=random_days)).strftime('%Y-%m-%d')
                         new_row[date_col] = new_date
+            # FIX: Ensure other dates (hire_date, created_at, etc.) are in the PAST
+            if other_date_cols:
+                for date_col in other_date_cols:
+                    if date_col in new_row:
+                        try:
+                            # Check if date is in the future
+                            current_date = datetime.strptime(new_row[date_col], '%Y-%m-%d')
+                            if current_date > datetime.now():
+                                # Replace with a past date (random between 1 month to 3 years ago)
+                                random_days = random.randint(30, 1095)
+                                new_date = (datetime.now() - timedelta(days=random_days)).strftime('%Y-%m-%d')
+                                new_row[date_col] = new_date
+                        except:
+                            # If date parsing fails, generate a new past date
+                            random_days = random.randint(30, 1095)
+                            new_date = (datetime.now() - timedelta(days=random_days)).strftime('%Y-%m-%d')
+                            new_row[date_col] = new_date
+            # Enhance amount fields to match threshold
             if threshold_amount and amount_cols:
                 for amount_col in amount_cols:
                     if amount_col in new_row:
+                        # 55% of records above threshold, 45% below
+                        if i % 100 < 55:  # More deterministic distribution
+                            # Above threshold
+                            new_row[amount_col] = int(random.uniform(threshold_amount * 1.05, threshold_amount * 2.5))
                         else:
+                            # Below threshold
+                            new_row[amount_col] = int(random.uniform(threshold_amount * 0.4, threshold_amount * 0.95))
+            # Apply text filters to ensure enough matching records
+            for col_name, target_value in text_filters.items():
+                if col_name in new_row:
+                    # 55% should match the filter value
+                    if i % 100 < 55:
+                        new_row[col_name] = target_value
+                    else:
+                        # Use other values for variety
+                        if col_name == 'department':
+                            other_depts = ['Marketing', 'Sales', 'HR', 'Finance', 'Operations', 'IT']
+                            new_row[col_name] = random.choice([d for d in other_depts if d != target_value])
+                        elif col_name == 'status':
+                            other_statuses = ['Active', 'Inactive', 'Pending', 'Completed', 'Cancelled']
+                            new_row[col_name] = random.choice([s for s in other_statuses if s != target_value])
             enhanced_data.append(new_row)
+        # Ensure we have at least 20 rows
+        while len(enhanced_data) < 20:
+            template_idx = len(enhanced_data) % len(original_data)
+            template_row = enhanced_data[template_idx].copy()
             # Modify IDs to be unique
             for col in table['columns']:
     width: 100%;
     border-collapse: collapse;
     margin: 10px 0;
+    font-size: 14px;
 }
 .table th {
+    background-color: #4a5568;
+    color: white;
     font-weight: bold;
+    padding: 10px;
     text-align: left;
+    border: 1px solid #2d3748;
 }
 .table td {
+    padding: 8px 10px;
+    border: 1px solid #e2e8f0;
 }
 .table-striped tbody tr:nth-child(odd) {
+    background-color: #f7fafc;
+}
+.table-striped tbody tr:nth-child(even) {
+    background-color: #ffffff;
+}
+.table-striped tbody tr:hover {
+    background-color: #edf2f7;
 }
 """