NL2SQL_ENGINE / schema_linker.py
nisar9034's picture
Upload 5 files
5e468f2 verified
import string
def link_schema(user_query, raw_schema):
"""
Scans the user's question and tags database columns that match exactly.
"""
# 1. CLEAN THE QUERY
# Convert to lowercase: "Show me the Budget!" -> "show me the budget!"
query_lower = user_query.lower()
# Remove punctuation using Python's string library
# "show me the budget!" -> "show me the budget"
for punctuation_mark in string.punctuation:
query_lower = query_lower.replace(punctuation_mark, "")
# Split the clean sentence into an array of individual words
# ["show", "me", "the", "budget"]
query_words = set(query_lower.split())
# 2. PREPARE THE OUTPUT STORAGE
# This array will hold the final, formatted strings for each table
linked_schema_lines = []
# 3. ITERATE THROUGH THE SCHEMA
# raw_schema is a dictionary where the key is the table name,
# and the value is a list of column names.
for table_name, column_list in raw_schema.items():
tagged_columns = []
for col in column_list:
# We convert the column to lowercase just in case
col_clean = col.lower()
# 4. THE MATCHING LOGIC
# If the exact column name exists in the array of user words
if col_clean in query_words:
# Append the tag so the AI knows this is important
tagged_columns.append(f'{col} (Exact Match: "{col}")')
else:
# Otherwise, just keep the column name as normal
tagged_columns.append(col)
# 5. FORMAT THE FINAL STRING
# Glue the tagged columns together with commas
formatted_cols = ", ".join(tagged_columns)
# Build the final string for this specific table
table_string = f"Table: {table_name} | Cols: {formatted_cols}"
# Add it to our output storage
linked_schema_lines.append(table_string)
# 6. RETURN THE RESULT
# Join all the individual table strings together with line breaks
return " \n".join(linked_schema_lines)
# --- TESTING BLOCK ---
if __name__ == "__main__":
# Simulate what the frontend UI will hand to your function
test_question = "What is the location and budget for the marketing department?"
# Simulate a messy database schema
test_schema = {
"employees": ["id", "name", "department_id", "salary"],
"departments": ["id", "name", "location", "budget", "industry"]
}
print("--- INPUTS ---")
print(f"Question: {test_question}")
print("\n--- YOUR OUTPUT ---")
final_result = link_schema(test_question, test_schema)
print(final_result)