Spaces:

Balaprime
/

NL2SQL

Build error

App Files Files Community

Balaprime committed on May 16, 2025

Commit

2401c92

verified ·

1 Parent(s): e7f0576

Update app.py

Browse files

Files changed (1) hide show

app.py +156 -307

app.py CHANGED Viewed

@@ -1,309 +1,158 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
-import re
-import sqlparse
-# Load model and tokenizer
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = AutoModelForCausalLM.from_pretrained(
-    "onkolahmet/Qwen2-0.5B-Instruct-SQL-generator",
-    torch_dtype="auto",
-    device_map="auto"
 )
-tokenizer = AutoTokenizer.from_pretrained("onkolahmet/Qwen2-0.5B-Instruct-SQL-generator")
-# # Few-shot examples to include in each prompt
-# examples = [
-#     {
-#         "question": "Get the names and emails of customers who placed an order in the last 30 days.",
-#         "sql": "SELECT name, email FROM customers WHERE order_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY);"
-#     },
-#     {
-#         "question": "Find all employees with a salary greater than 50000.",
-#         "sql": "SELECT * FROM employees WHERE salary > 50000;"
-#     },
-#     {
-#         "question": "List all product names and their categories where the price is below 50.",
-#         "sql": "SELECT name, category FROM products WHERE price < 50;"
-#     },
-#     {
-#         "question": "How many users registered in the year 2022?",
-#         "sql": "SELECT COUNT(*) FROM users WHERE YEAR(registration_date) = 2022;"
-#     }
-# ]
-def generate_sql(question, context=None):
-    # Construct prompt with few-shot examples and context if available
-    prompt = "Translate natural language questions to SQL queries.\n\n"
-    # Add table context if available
-    if context and context.strip():
-        prompt += f"Table Context:\n{context}\n\n"
-    # # Add few-shot examples
-    # for ex in examples:
-    #     prompt += f"Q: {ex['question']}\nSQL: {ex['sql']}\n\n"
-    # Add the current question
-    prompt += f"Q: {question}\nSQL:"
-    # Tokenize and generate
-    inputs = tokenizer(prompt, return_tensors="pt").to(device)
-    # Generate SQL query
-    outputs = model.generate(
-        inputs.input_ids,
-        max_new_tokens=128,
-        do_sample=True,
-        eos_token_id=tokenizer.eos_token_id
-    )
-    # Extract and decode only the new generation
-    sql_query = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
-    return sql_query.strip()
-def clean_sql_output(sql_text):
-    """
-    Clean and deduplicate SQL queries:
-    1. Remove comments
-    2. Remove duplicate queries
-    3. Extract only the most relevant query
-    4. Format properly
-    """
-    # Remove SQL comments (both single line and multi-line)
-    sql_text = re.sub(r'--.*?$', '', sql_text, flags=re.MULTILINE)
-    sql_text = re.sub(r'/\*.*?\*/', '', sql_text, flags=re.DOTALL)
-    # Remove markdown code block syntax if present
-    sql_text = re.sub(r'```sql|```', '', sql_text)
-    # Split into individual queries if multiple exist
-    if ';' in sql_text:
-        queries = [q.strip() for q in sql_text.split(';') if q.strip()]
-    else:
-        # If no semicolons, try to identify separate queries by SELECT statements
-        sql_text_cleaned = re.sub(r'\s+', ' ', sql_text)
-        select_matches = list(re.finditer(r'SELECT\s+', sql_text_cleaned, re.IGNORECASE))
-        if len(select_matches) > 1:
-            queries = []
-            for i in range(len(select_matches)):
-                start = select_matches[i].start()
-                end = select_matches[i+1].start() if i < len(select_matches) - 1 else len(sql_text_cleaned)
-                queries.append(sql_text_cleaned[start:end].strip())
-        else:
-            queries = [sql_text]
-    # Remove empty queries
-    queries = [q for q in queries if q.strip()]
-    if not queries:
-        return ""
-    # If we have multiple queries, need to deduplicate
-    if len(queries) > 1:
-        # Normalize queries for comparison (lowercase, remove extra spaces)
-        normalized_queries = []
-        for q in queries:
-            # Use sqlparse to format and normalize
-            try:
-                formatted = sqlparse.format(
-                    q + ('' if q.strip().endswith(';') else ';'),
-                    keyword_case='lower',
-                    identifier_case='lower',
-                    strip_comments=True,
-                    reindent=True
-                )
-                normalized_queries.append(formatted)
-            except:
-                # If sqlparse fails, just do basic normalization
-                normalized = re.sub(r'\s+', ' ', q.lower().strip())
-                normalized_queries.append(normalized)
-        # Find unique queries
-        unique_queries = []
-        unique_normalized = []
-        for i, norm_q in enumerate(normalized_queries):
-            if norm_q not in unique_normalized:
-                unique_normalized.append(norm_q)
-                unique_queries.append(queries[i])
-        # Choose the most likely correct query:
-        # 1. Prefer queries with SELECT
-        # 2. Prefer longer queries (often more detailed)
-        # 3. Prefer first query if all else equal
-        select_queries = [q for q in unique_queries if re.search(r'SELECT\s+', q, re.IGNORECASE)]
-        if select_queries:
-            # Choose the longest SELECT query (likely most detailed)
-            best_query = max(select_queries, key=len)
-        elif unique_queries:
-            # If no SELECT queries, choose the longest query
-            best_query = max(unique_queries, key=len)
-        else:
-            # Fallback to the first query
-            best_query = queries[0]
-    else:
-        best_query = queries[0]
-    # Clean up the chosen query
-    best_query = best_query.strip()
-    if not best_query.endswith(';'):
-        best_query += ';'
-    # Final formatting to ensure consistent spacing
-    best_query = re.sub(r'\s+', ' ', best_query)
-    try:
-        # Use sqlparse to nicely format the SQL for display
-        formatted_sql = sqlparse.format(
-            best_query,
-            keyword_case='upper',
-            identifier_case='lower',
-            reindent=True,
-            indent_width=2
-        )
-        return formatted_sql
-    except:
-        return best_query
-def process_input(question, table_context):
-    """Function to process user input through the model and return formatted results"""
-    if not question.strip():
-        return "Please enter a question."
-    # Generate SQL from the question and context
-    raw_sql = generate_sql(question, table_context)
-    # Clean the SQL output
-    cleaned_sql = clean_sql_output(raw_sql)
-    if not cleaned_sql:
-        return "Sorry, I couldn't generate a valid SQL query. Please try rephrasing your question."
-    return cleaned_sql
-# Sample table context examples for the example selector
-example_contexts = [
-    # Example 1
-    """
-CREATE TABLE customers (
-  id INT PRIMARY KEY,
-  name VARCHAR(100),
-  email VARCHAR(100),
-  order_date DATE
-);
-    """,
-    # Example 2
-    """
-CREATE TABLE products (
-  id INT PRIMARY KEY,
-  name VARCHAR(100),
-  category VARCHAR(50),
-  price DECIMAL(10,2),
-  stock_quantity INT
-);
-    """,
-    # Example 3
-    """
-CREATE TABLE employees (
-  id INT PRIMARY KEY,
-  name VARCHAR(100),
-  department VARCHAR(50),
-  salary DECIMAL(10,2),
-  hire_date DATE
-);
-CREATE TABLE departments (
-  id INT PRIMARY KEY,
-  name VARCHAR(50),
-  manager_id INT,
-  budget DECIMAL(15,2)
-);
-    """
-]
-# Sample question examples
-example_questions = [
-    "Get the names and emails of customers who placed an order in the last 30 days.",
-    "Find all products with less than 10 items in stock.",
-    "List all employees in the Sales department with a salary greater than 50000.",
-    "What is the total budget for departments with more than 5 employees?",
-    "Count how many products are in each category where the price is greater than 100."
-]
-# Create the Gradio interface
-with gr.Blocks(title="Text to SQL Converter") as demo:
-    gr.Markdown("# Text to SQL Query Converter")
-    gr.Markdown("Enter your question and optional table context to generate an SQL query.")
-    with gr.Row():
-        with gr.Column():
-            question_input = gr.Textbox(
-                label="Your Question",
-                placeholder="e.g., Find all products with price less than $50",
-                lines=2
-            )
-            table_context = gr.Textbox(
-                label="Table Context (Optional)",
-                placeholder="Enter your database schema or table definitions here...",
-                lines=10
-            )
-            submit_btn = gr.Button("Generate SQL Query")
-        with gr.Column():
-            sql_output = gr.Code(
-                label="Generated SQL Query",
-                language="sql",
-                lines=12
-            )
-    # Examples section
-    gr.Markdown("### Try some examples")
-    example_selector = gr.Examples(
-        examples=[
-            ["List all products in the 'Electronics' category with price less than $500", example_contexts[1]],
-            ["Find the total number of employees in each department", example_contexts[2]],
-            ["Get customers who placed orders in the last 7 days", example_contexts[0]],
-            ["Count the number of products in each category", example_contexts[1]],
-            ["Find the average salary by department", example_contexts[2]]
-        ],
-        inputs=[question_input, table_context]
-    )
-    # Set up the submit button to trigger the process_input function
-    submit_btn.click(
-        fn=process_input,
-        inputs=[question_input, table_context],
-        outputs=sql_output
-    )
-    # Also trigger on pressing Enter in the question input
-    question_input.submit(
-        fn=process_input,
-        inputs=[question_input, table_context],
-        outputs=sql_output
-    )
-    # Add information about the model
-    gr.Markdown("""
-    ### About
-    This app uses a fine-tuned language model to convert natural language questions into SQL queries.
-    - **Model**: [onkolahmet/Qwen2-0.5B-Instruct-SQL-generator](https://huggingface.co/onkolahmet/Qwen2-0.5B-Instruct-SQL-generator)
-    - **How to use**:
-      1. Enter your question in natural language
-      2. If you have specific table schemas, add them in the Table Context field
-      3. Click "Generate SQL Query" or press Enter
-    Note: The model works best when table context is provided, but can generate generic SQL queries without it.
-    """)
-# Launch the app
-demo.launch()

+from dotenv import load_dotenv
+import os
+from sentence_transformers import SentenceTransformer
 import gradio as gr
+from sklearn.metrics.pairwise import cosine_similarity
+from groq import Groq
+load_dotenv()
+api = os.getenv("groq_api_key")
+# Build the natural-language descriptions of the three known tables and embed
+# them so a user query can later be matched to the closest table.
+# Returns a 5-tuple (embeddings, model, student, employee, course): the encoded
+# descriptions, the SentenceTransformer instance that produced them, and the
+# three raw description strings.
+def create_metadata_embeddings():
+  student="""
+  Table: student
+  Columns:
+  - student_id: an integer representing the unique ID of a student.
+  - first_name: a string containing the first name of the student.
+  - last_name: a string containing the last name of the student.
+  - date_of_birth: a date representing the student's birthdate.
+  - email: a string for the student's email address.
+  - phone_number: a string for the student's contact number.
+  - major: a string representing the student's major field of study.
+  - year_of_enrollment: an integer for the year the student enrolled.
+  """
+  employee="""
+  Table: employee
+  Columns:
+  - employee_id: an integer representing the unique ID of an employee.
+  - first_name: a string containing the first name of the employee.
+  - last_name: a string containing the last name of the employee.
+  - email: a string for the employee's email address.
+  - department: a string for the department the employee works in.
+  - position: a string representing the employee's job title.
+  - salary: a float representing the employee's salary.
+  - date_of_joining: a date for when the employee joined the college.
+  """
+  # NOTE: the variable is named `course`, but the table it describes is `course_info`.
+  course="""
+  Table: course_info
+  Columns:
+  - course_id: an integer representing the unique ID of the course.
+  - course_name: a string containing the course's name.
+  - course_code: a string for the course's unique code.
+  - instructor_id: an integer for the ID of the instructor teaching the course.
+  - department: a string for the department offering the course.
+  - credits: an integer representing the course credits.
+  - semester: a string for the semester when the course is offered.
+  """
+  # Order matters: find_best_fit maps index 0/1/2 back to student/employee/course.
+  metadata_list = [student, employee, course]
+  # The same model instance is returned so callers can encode queries with it.
+  model = SentenceTransformer('all-MiniLM-L6-v2')
+  embeddings = model.encode(metadata_list)
+  return embeddings,model,student,employee,course
+def find_best_fit(embeddings,model,user_query,student,employee,course):
+  query_embedding = model.encode([user_query])
+  similarities = cosine_similarity(query_embedding, embeddings)
+  best_match_table = similarities.argmax()
+  if(best_match_table==0):
+    table_metadata=student
+  elif(best_match_table==1):
+    table_metadata=employee
+  else:
+    table_metadata=course
+  return table_metadata
+# Assemble the two chat prompts for the SQL-generation model.
+# user_query: the user's natural-language request.
+# table_metadata: the description text of the table the query should target.
+# Returns (system_prompt, user_prompt).
+def create_prompt(user_query,table_metadata):
+  # Fixed instructions: single-table SQL only, and reply with nothing but the
+  # query on a single line.
+  system_prompt="""
+  You are a SQL query generator specialized in generating SQL queries for a single table at a time. Your task is to accurately convert natural language queries into SQL statements based on the user's intent and the provided table metadata.
+  Rules:
+  Single Table Only: Assume all queries are related to a single table provided in the metadata. Ignore any references to other tables.
+  Metadata-Based Validation: Always ensure the generated query matches the table name, columns, and data types provided in the metadata.
+  User Intent: Accurately capture the user's requirements, such as filters, sorting, or aggregations, as expressed in natural language.
+  SQL Syntax: Use standard SQL syntax that is compatible with most relational database systems.
+  Input Format:
+  User Query: The user's natural language request.
+  Table Metadata: The structure of the relevant table, including the table name, column names, and data types.
+  Output Format:
+  SQL Query: A valid SQL query formatted for readability.
+  Do not output anything else except the SQL query.Not even a single word extra.Ouput the whole query in a single line only.
+  You are ready to generate SQL queries based on the user input and table metadata.
+  """
+  # Per-request part: both values are interpolated as-is, with no escaping.
+  user_prompt=f"""
+  User Query: {user_query}
+  Table Metadata: {table_metadata}
+  """
+  return system_prompt,user_prompt
+def generate_output(system_prompt,user_prompt):
+  client = Groq(api_key=api,)
+  chat_completion = client.chat.completions.create(messages=[
+    {"role": "system", "content": system_prompt},
+     {"role": "user","content": user_prompt,}],model="llama3-70b-8192",)
+  res = chat_completion.choices[0].message.content
+  select=res[0:6].lower()
+  if(select=="select"):
+    output=res
+  else:
+    output="Can't perform the task at the moment."
+  return output
+def response(user_query):
+  embeddings,model,student,employee,course=create_metadata_embeddings()
+  table_metadata=find_best_fit(embeddings,model,user_query,student,employee,course)
+  system_prompt,user_prompt=create_prompt(user_query,table_metadata)
+  output=generate_output(system_prompt,user_prompt)
+  return output
+desc="""
+There are three tables in the database:
+Student Table:
+The table contains the student's unique ID, first name, last name, date of birth, email address, phone number, major field of study, and year of enrollment.
+Employee Table:
+The table includes the employee's unique ID, first name, last name, email address, department, job position, salary, and date of joining.
+Course Info Table:
+The table holds information about the course's unique ID, name, course code, instructor ID, department offering the course, number of credits, and the semester in which the course is offered.
+"""
+demo = gr.Interface(
+    fn=response,
+    inputs=gr.Textbox(label="Please provide the natural language query"),
+    outputs=gr.Textbox(label="SQL Query"),
+    title="SQL Query generator",
+    description=desc
 )
+demo.launch(share="True")