import string def link_schema(user_query, raw_schema): """ Scans the user's question and tags database columns that match exactly. """ # 1. CLEAN THE QUERY # Convert to lowercase: "Show me the Budget!" -> "show me the budget!" query_lower = user_query.lower() # Remove punctuation using Python's string library # "show me the budget!" -> "show me the budget" for punctuation_mark in string.punctuation: query_lower = query_lower.replace(punctuation_mark, "") # Split the clean sentence into an array of individual words # ["show", "me", "the", "budget"] query_words = set(query_lower.split()) # 2. PREPARE THE OUTPUT STORAGE # This array will hold the final, formatted strings for each table linked_schema_lines = [] # 3. ITERATE THROUGH THE SCHEMA # raw_schema is a dictionary where the key is the table name, # and the value is a list of column names. for table_name, column_list in raw_schema.items(): tagged_columns = [] for col in column_list: # We convert the column to lowercase just in case col_clean = col.lower() # 4. THE MATCHING LOGIC # If the exact column name exists in the array of user words if col_clean in query_words: # Append the tag so the AI knows this is important tagged_columns.append(f'{col} (Exact Match: "{col}")') else: # Otherwise, just keep the column name as normal tagged_columns.append(col) # 5. FORMAT THE FINAL STRING # Glue the tagged columns together with commas formatted_cols = ", ".join(tagged_columns) # Build the final string for this specific table table_string = f"Table: {table_name} | Cols: {formatted_cols}" # Add it to our output storage linked_schema_lines.append(table_string) # 6. RETURN THE RESULT # Join all the individual table strings together with line breaks return " \n".join(linked_schema_lines) # --- TESTING BLOCK --- if __name__ == "__main__": # Simulate what the frontend UI will hand to your function test_question = "What is the location and budget for the marketing department?" # Simulate a messy database schema test_schema = { "employees": ["id", "name", "department_id", "salary"], "departments": ["id", "name", "location", "budget", "industry"] } print("--- INPUTS ---") print(f"Question: {test_question}") print("\n--- YOUR OUTPUT ---") final_result = link_schema(test_question, test_schema) print(final_result)