Spaces:

nisar9034
/

NL2SQL_ENGINE

Sleeping

App Files Files Community

nisar9034 committed on Apr 24

Commit

5e468f2

verified ·

1 Parent(s): 435f01f

Upload 5 files

Browse files

Files changed (5) hide show

app.py +97 -0
execution_checker.py +52 -0
few_shot_retriever.py +71 -0
requirements.txt +5 -3
schema_linker.py +76 -0

app.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import streamlit as st
+import json
+import re # <--- Added this to handle reading the text box
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+# Import the tools from the rest of the team
+from schema_linker import link_schema
+from few_shot_retriever import FewShotRetriever
+from execution_checker import get_best_query
+# --- ADDED: Teammate D's Regex Parser ---
+def parse_raw_sql_to_dict(raw_sql):
+    """Convert raw CREATE TABLE statements into a ``{table: [columns]}`` dict.
+
+    Args:
+        raw_sql: Text holding zero or more ``CREATE TABLE name (...)``
+            statements. ``IF NOT EXISTS`` and a missing final ``;`` are
+            accepted. ``None`` or an empty string yields an empty dict.
+
+    Returns:
+        A dict mapping each table name to the list of its column names, in
+        declaration order. Table-level constraints (``PRIMARY KEY (...)``,
+        ``FOREIGN KEY ...``, ``UNIQUE``, ``CHECK``, ``CONSTRAINT``) are not
+        reported as columns.
+    """
+    # First words that introduce a table-level constraint, not a column.
+    constraint_keywords = {"PRIMARY", "FOREIGN", "UNIQUE", "CHECK", "CONSTRAINT"}
+    schema_dict = {}
+    table_blocks = re.findall(
+        r'CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?["`\[]?(\w+)["`\]]?\s*\((.*?)\)\s*(?:;|\Z)',
+        raw_sql or "",
+        re.IGNORECASE | re.DOTALL,
+    )
+    for table_name, columns_str in table_blocks:
+        cols = []
+        for col_def in _split_top_level_commas(columns_str):
+            tokens = col_def.split()
+            if not tokens or tokens[0].upper() in constraint_keywords:
+                continue
+            # Drop identifier quoting such as "name", `name` or [name].
+            cols.append(tokens[0].strip('"`[]'))
+        schema_dict[table_name] = cols
+    return schema_dict
+def _split_top_level_commas(text):
+    """Split *text* on commas that are not nested inside parentheses."""
+    parts = []
+    depth = 0
+    start = 0
+    for pos, char in enumerate(text):
+        if char == "(":
+            depth += 1
+        elif char == ")":
+            depth -= 1
+        elif char == "," and depth == 0:
+            # A comma inside DECIMAL(10,2) or PRIMARY KEY (a, b) has depth > 0
+            # and therefore never ends a column definition.
+            parts.append(text[start:pos])
+            start = pos + 1
+    parts.append(text[start:])
+    return parts
+# 1. LOAD THE HEAVY AI MODELS ONCE
+@st.cache_resource
+def load_ai_models():
+    """Create the retriever, tokenizer and text-to-SQL model for the app.
+
+    Wrapped in ``st.cache_resource`` so the heavy objects are meant to be
+    built once rather than on every script rerun.
+
+    Returns:
+        A ``(retriever, tokenizer, model)`` tuple, in that order.
+    """
+    # Tokenizer and model weights come from the same pre-trained checkpoint.
+    checkpoint = "alpecevit/flan-t5-base-text2sql"
+    return (
+        FewShotRetriever(),
+        T5Tokenizer.from_pretrained(checkpoint),
+        T5ForConditionalGeneration.from_pretrained(checkpoint),
+    )
+retriever, tokenizer, model = load_ai_models()
+# 2. BUILD THE WEBSITE DASHBOARD
+st.title("Natural Language to SQL Engine")
+st.write("Enter your database schema and question below.")
+# Text box for the user to paste their raw CREATE TABLE statements
+user_raw_schema = st.text_area(
+    "Paste your CREATE TABLE statements here:",
+    height=150,
+    placeholder="CREATE TABLE employees (id INTEGER, name TEXT);\nCREATE TABLE departments (id INTEGER, location TEXT);"
+)
+# Text box for the English question
+user_question = st.text_input("What do you want to know?", placeholder="e.g., Show me all employees in Chicago")
+# The big "Generate" button
+if st.button("Generate SQL"):
+    if user_raw_schema and user_question:
+        with st.spinner("Processing through the pipeline..."):
+            # Parse whatever schema the user pasted into {table: [columns]}.
+            schema_dict = parse_raw_sql_to_dict(user_raw_schema)
+            # 1. Tag the columns that the question mentions by name.
+            tagged_schema = link_schema(user_question, schema_dict)
+            # 2. Retrieve similar past examples. They are deliberately NOT
+            #    added to the prompt below, so as not to confuse the
+            #    pre-trained model; the call is kept for a future model.
+            few_shot_examples = retriever.get_few_shot_prompt(user_question)
+            # 3. Build the prompt from the question and the tagged schema.
+            final_prompt = f"Translate English to SQLite: {user_question} \nSchema Context: \n{tagged_schema}"
+            # 4. Generate 5 candidate queries using beam search.
+            inputs = tokenizer(final_prompt, return_tensors="pt", max_length=1024, truncation=True)
+            outputs = model.generate(
+                **inputs,
+                max_length=256,
+                num_beams=5,
+                num_return_sequences=5
+            )
+            # Decode once; the same list feeds the debug output and the checker.
+            candidate_queries = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]
+            # Debug aid: show the raw candidates so bad generations are visible.
+            st.warning(f"DEBUG - AI's raw guesses: {candidate_queries}")
+            # 5. Keep the first candidate that SQLite accepts for this schema.
+            winning_sql = get_best_query(user_raw_schema, candidate_queries)
+            # get_best_query reports failures as strings starting with
+            # "Error:", so those must not be presented as a success.
+            if winning_sql.startswith("Error:"):
+                st.error(winning_sql)
+            else:
+                st.success("Query Generated Successfully!")
+                st.code(winning_sql, language="sql")
+    else:
+        st.error("Please provide both a schema and a question.")

execution_checker.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import sqlite3
+def get_best_query(schema_create_statements, candidate_queries):
+    """Return the first candidate query that SQLite accepts for the schema.
+
+    An in-memory database is created, the schema script is executed in it,
+    and each candidate is prepared with ``EXPLAIN`` so it is checked
+    without reading any data.
+
+    Args:
+        schema_create_statements: SQL script (typically CREATE TABLE
+            statements) used to build the temporary tables.
+        candidate_queries: Iterable of SQL strings, tried in order.
+
+    Returns:
+        ``"-- Selected Candidate #N (Syntax Valid)\\n<query>"`` for the first
+        accepted candidate (N is 1-based), or a string starting with
+        ``"Error:"`` when the schema is rejected or no candidate is accepted.
+    """
+    # In-memory database: nothing is written to disk.
+    conn = sqlite3.connect(':memory:')
+    try:
+        cursor = conn.cursor()
+        try:
+            cursor.executescript(schema_create_statements)
+        except sqlite3.Error as e:
+            return f"Error: The provided schema is invalid. ({e})"
+        for i, query in enumerate(candidate_queries, start=1):
+            try:
+                # EXPLAIN compiles the statement without running the query.
+                cursor.execute(f"EXPLAIN {query}")
+            except (sqlite3.Error, sqlite3.Warning) as e:
+                # A multi-statement candidate raises sqlite3.Warning or
+                # ProgrammingError (depending on Python version) rather than
+                # OperationalError; treat every rejection as a failed
+                # candidate instead of crashing the caller.
+                print(f"Candidate {i} failed syntax check: {e}")
+                continue
+            return f"-- Selected Candidate #{i} (Syntax Valid)\n{query}"
+        return "Error: All generated queries contained syntax errors."
+    finally:
+        # Close on every exit path, including the invalid-schema return.
+        conn.close()
+# --- TESTING BLOCK ---
+if __name__ == "__main__":
+    # Raw CREATE statements, in the form the frontend passes them along.
+    demo_schema = """
+    CREATE TABLE employees (id INTEGER PRIMARY KEY, name TEXT, salary REAL);
+    CREATE TABLE departments (id INTEGER PRIMARY KEY, name TEXT);
+    """
+    # Stand-ins for model output; only the last one should be accepted.
+    unknown_function = "SELECT SUMM(salary) FROM employees"
+    unknown_table = "SELECT sum(salary) FROM employees JOIN bad_table"
+    valid_query = "SELECT sum(salary) FROM employees"
+    print("Testing AI Candidates against In-Memory DB...\n")
+    winner = get_best_query(demo_schema, [unknown_function, unknown_table, valid_query])
+    print("\nWinning Query to show the user:\n" + winner)

few_shot_retriever.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import numpy as np
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+class FewShotRetriever:
+    """Find stored question/SQL pairs most similar to a new question."""
+    def __init__(self):
+        """Load the embedding model and embed the stored questions once."""
+        # 1. LOAD THE EMBEDDING MODEL
+        print("Loading BGE Model (This might take a minute the first time)...")
+        self.model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+        # 2. LOAD THE HISTORICAL DATA (The Answer Bank)
+        # Hardcoded for now; each entry pairs a question ("q") with its SQL.
+        self.historical_data = [
+            {"q": "What is the average salary of IT staff?", "sql": "SELECT avg(salary) FROM staff WHERE dept = 'IT'"},
+            {"q": "Count the number of patients in the ICU.", "sql": "SELECT count(*) FROM patients WHERE ward = 'ICU'"},
+            {"q": "Show me the total budget for the marketing department.", "sql": "SELECT sum(budget) FROM departments WHERE name = 'Marketing'"},
+            {"q": "Find the average age of all employees.", "sql": "SELECT avg(age) FROM employees"},
+            {"q": "How many marketing staff earn more than 50000?", "sql": "SELECT count(*) FROM staff WHERE dept = 'Marketing' AND salary > 50000"}
+        ]
+        # 3. PRE-CALCULATE THE VECTORS
+        # Embed the stored questions once so each lookup only embeds the new one.
+        historical_questions = [item["q"] for item in self.historical_data]
+        self.historical_embs = self.model.encode(historical_questions, normalize_embeddings=True)
+    def get_few_shot_prompt(self, user_query, top_k=2):
+        """Format the ``top_k`` most similar stored examples as a text block.
+
+        Args:
+            user_query: The new natural-language question.
+            top_k: Maximum number of examples to include. Values larger than
+                the number of stored examples are capped; zero or negative
+                values produce the header with no examples.
+
+        Returns:
+            The header line followed by one "Example Question"/"Example SQL"
+            pair per selected example, most similar first.
+        """
+        prompt_prefix = "Here are some examples of translating English to SQL:\n\n"
+        top_k = min(top_k, len(self.historical_data))
+        if top_k <= 0:
+            # Guard: a slice of [-0:] would select every example, not none.
+            return prompt_prefix
+        # 1. Embed the new question.
+        q_emb = self.model.encode([user_query], normalize_embeddings=True)
+        # 2. Cosine similarity between the new question and each stored one.
+        scores = cosine_similarity(q_emb, self.historical_embs)[0]
+        # 3. argsort() is ascending, so take the last top_k and reverse them.
+        top_indices = scores.argsort()[-top_k:][::-1]
+        # 4. Format the selected examples.
+        examples = (self.historical_data[idx] for idx in top_indices)
+        return prompt_prefix + "".join(
+            f"Example Question: {example['q']}\nExample SQL: {example['sql']}\n\n"
+            for example in examples
+        )
+# --- TESTING BLOCK ---
+if __name__ == "__main__":
+    # Building the retriever loads the embedding model.
+    demo_retriever = FewShotRetriever()
+    # A question that is not in the stored examples.
+    new_query = "What is the average salary of the sales team?"
+    print("\n--- INPUT ---")
+    print(f"New User Question: {new_query}")
+    print("\n--- YOUR OUTPUT (The Cheat Sheet) ---")
+    # Show the two closest stored examples.
+    print(demo_retriever.get_few_shot_prompt(new_query, top_k=2))

requirements.txt CHANGED Viewed

@@ -1,3 +1,5 @@
-altair
-pandas
-streamlit

+streamlit
+transformers
+torch
+sentence-transformers
+scikit-learn

schema_linker.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import string
+def link_schema(user_query, raw_schema):
+    """Tag schema columns whose names appear as whole words in the question.
+
+    Args:
+        user_query: The natural-language question.
+        raw_schema: Dict mapping each table name to a list of column names.
+
+    Returns:
+        One line per table, ``"Table: <name> | Cols: <col>, <col>, ..."``,
+        joined with ``" \\n"``. A column whose lowercase name equals a word of
+        the cleaned question is written as ``<col> (Exact Match: "<col>")``.
+        An empty schema yields an empty string.
+    """
+    # Lowercase the question and delete punctuation in a single pass.
+    # "_" is kept on purpose: removing it would turn "department_id" into
+    # "departmentid", so snake_case column names could never match.
+    removable = string.punctuation.replace("_", "")
+    cleaned_query = user_query.lower().translate(str.maketrans("", "", removable))
+    # A set gives constant-time membership tests in the loop below.
+    query_words = set(cleaned_query.split())
+    linked_schema_lines = []
+    for table_name, column_list in raw_schema.items():
+        tagged_columns = [
+            f'{col} (Exact Match: "{col}")' if col.lower() in query_words else col
+            for col in column_list
+        ]
+        linked_schema_lines.append(f"Table: {table_name} | Cols: {', '.join(tagged_columns)}")
+    return " \n".join(linked_schema_lines)
+# --- TESTING BLOCK ---
+if __name__ == "__main__":
+    # Stand-in for the question the frontend would pass in.
+    test_question = "What is the location and budget for the marketing department?"
+    # Stand-in schema: table name -> list of column names.
+    demo_schema = {
+        "employees": ["id", "name", "department_id", "salary"],
+        "departments": ["id", "name", "location", "budget", "industry"],
+    }
+    print("--- INPUTS ---")
+    print(f"Question: {test_question}")
+    print("\n--- YOUR OUTPUT ---")
+    print(link_schema(test_question, demo_schema))