Spaces:

abir-hr196
/

tinysql-demo

Sleeping

App Files Files Community

tinysql-demo / app.py

abir-hr196

Initial commit

38a8f52 2 months ago

raw

history blame

6.12 kB

	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import torch

	# Model configurations.
	# Maps the label offered in the "Select Model" dropdown to the model id that
	# `load_model` hands to `AutoTokenizer.from_pretrained` /
	# `AutoModelForCausalLM.from_pretrained`.
	# NOTE(review): every label ends in "_Syn", yet only the CS4/CS5 ids mention
	# "synonyms" -- confirm the labels match the intended checkpoints.
	MODELS = {
	"BM1_CS1_Syn (33M)": "withmartian/sql_interp_bm1_cs1_experiment_1.10",
	"BM1_CS2_Syn (33M)": "withmartian/sql_interp_bm1_cs2_experiment_2.10",
	"BM1_CS3_Syn (33M)": "withmartian/sql_interp_bm1_cs3_experiment_3.10",
	"BM1_CS4_Syn (33M)": "withmartian/sql_interp_bm1_cs4_dataset_synonyms_experiment_1.1",
	"BM1_CS5_Syn (33M)": "withmartian/sql_interp_bm1_cs5_dataset_synonyms_experiment_1.2",
	"BM2_CS1_Syn (0.5B)": "withmartian/sql_interp_bm2_cs1_experiment_4.3",
	"BM2_CS2_Syn (0.5B)": "withmartian/sql_interp_bm2_cs2_experiment_5.3",
	"BM2_CS3_Syn (0.5B)": "withmartian/sql_interp_bm2_cs3_experiment_6.3",
	"BM3_CS1_Syn (1B)": "withmartian/sql_interp_bm3_cs1_experiment_7.3",
	"BM3_CS2_Syn (1B)": "withmartian/sql_interp_bm3_cs2_experiment_8.3",
	"BM3_CS3_Syn (1B)": "withmartian/sql_interp_bm3_cs3_experiment_9.3",
	}

	# Cache loaded models.
	# Keyed by the same labels as MODELS; each value is the (tokenizer, model)
	# tuple stored by `load_model`. Entries are never removed in the code shown.
	model_cache = {}

	def load_model(model_name):
	    """Return the ``(tokenizer, model)`` pair for a model label, loading it once.

	    Args:
	        model_name: A key of ``MODELS`` (the label shown in the dropdown).

	    Returns:
	        The ``(tokenizer, model)`` tuple stored in ``model_cache`` for this label.

	    Raises:
	        KeyError: If ``model_name`` is not a key of ``MODELS``.
	    """
	    # Cache hit: reuse the pair stored by an earlier call.
	    if model_name in model_cache:
	        return model_cache[model_name]

	    repo_id = MODELS[model_name]
	    tok = AutoTokenizer.from_pretrained(repo_id)
	    lm = AutoModelForCausalLM.from_pretrained(
	        repo_id,
	        torch_dtype=torch.float16,
	        device_map="auto",
	    )

	    # Store under the label so later calls skip the load entirely.
	    loaded = (tok, lm)
	    model_cache[model_name] = loaded
	    return loaded

	def generate_sql(model_name, instruction, schema, max_length=256, temperature=0.7):
	    """Generate a SQL query from a natural-language instruction and a schema.

	    Args:
	        model_name: Label of the model to use; forwarded to ``load_model``.
	        instruction: Natural-language request, placed after ``### Instruction:``.
	        schema: Schema text (e.g. ``CREATE TABLE`` statements), placed after
	            ``### Context:``.
	        max_length: Passed to ``model.generate`` as ``max_length``, so it bounds
	            the prompt tokens plus the generated tokens, not only the new ones.
	        temperature: Sampling temperature. Values ``> 0`` enable sampling; ``0``
	            (or less) switches to greedy decoding.

	    Returns:
	        The text following the last ``### Response:`` marker in the decoded
	        output, stripped. If anything raises, the string ``"Error: <message>"``
	        is returned instead so the UI can display it.
	    """
	    try:
	        tokenizer, model = load_model(model_name)

	        # Format prompt
	        prompt = f"""### Instruction: {instruction}
	### Context: {schema}
	### Response:"""

	        # Tokenize and move the tensors to wherever the model was placed.
	        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

	        sampling = temperature > 0
	        generation_kwargs = {
	            # UI widgets may deliver a float; generate expects an integer length.
	            "max_length": int(max_length),
	            "do_sample": sampling,
	            "pad_token_id": tokenizer.eos_token_id,
	        }
	        if sampling:
	            # Temperature is a sampling-only knob: handing 0.0 to generate()
	            # in greedy mode is an invalid/ignored setting, so only pass it
	            # when sampling is actually enabled.
	            generation_kwargs["temperature"] = temperature

	        # Generate
	        outputs = model.generate(**inputs, **generation_kwargs)

	        # Decode the full sequence (prompt + continuation).
	        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

	        # Keep only what follows the last "### Response:" marker; when the
	        # marker is absent, rpartition yields the whole text as its tail.
	        return generated.rpartition("### Response:")[2].strip()

	    except Exception as e:
	        # UI boundary: report the failure as text instead of crashing the app.
	        return f"Error: {str(e)}"

	# Example queries.
	# Each row is [model label, natural-language query, schema], in the order of
	# the `inputs` list given to `gr.Examples`. The model label must be one of
	# the keys of MODELS, because it is written into the model dropdown and later
	# used to look the model up.
	examples = [
	    [
	        "BM1_CS1_Syn (33M)",
	        "Show me the name and salary from employees",
	        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))",
	    ],
	    [
	        "BM2_CS2_Syn (0.5B)",
	        "List worker earnings from highest to lowest",
	        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))",
	    ],
	    [
	        "BM3_CS3_Syn (1B)",
	        "Count how many employees in each department",
	        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))",
	    ],
	]

	# Create Gradio interface
	with gr.Blocks(title="TinySQL Demo") as demo:
	    # Introductory text shown at the top of the page.
	    gr.Markdown("""
	# 🔍 TinySQL: Text-to-SQL Generation Demo

	Generate SQL queries from natural language using models trained on TinySQL.
	Select a model, provide a natural language instruction and database schema, then click Generate.

	Model Types:
	- BM1 (33M params): TinyStories-based, fastest
	- BM2 (0.5B params): Qwen2.5-based, balanced
	- BM3 (1B params): Llama-3.2-based, most accurate
	- Syn variants: Trained on synonym dataset (handles semantic mappings)
	""")

	    with gr.Row():
	        # Left column: all inputs plus the trigger button.
	        with gr.Column(scale=2):
	            model_dropdown = gr.Dropdown(
	                choices=list(MODELS.keys()),
	                value="BM2_CS1_Syn (0.5B)",
	                label="Select Model",
	                info="Choose model size and training dataset"
	            )

	            instruction = gr.Textbox(
	                label="Natural Language Query",
	                placeholder="e.g., Show me all employees with salary greater than 50000",
	                lines=2
	            )

	            schema = gr.Textbox(
	                label="Database Schema",
	                placeholder="CREATE TABLE employees (name VARCHAR, salary INT, department VARCHAR)",
	                lines=3,
	                value="CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))"
	            )

	            # Generation settings, forwarded to generate_sql as
	            # max_length and temperature.
	            with gr.Row():
	                max_length = gr.Slider(
	                    minimum=64,
	                    maximum=512,
	                    value=256,
	                    step=32,
	                    label="Max Length"
	                )
	                temperature = gr.Slider(
	                    minimum=0.0,
	                    maximum=1.0,
	                    value=0.1,
	                    step=0.1,
	                    label="Temperature"
	                )

	            generate_btn = gr.Button("Generate SQL", variant="primary")

	        # Right column: the generated query (or an "Error: ..." message).
	        with gr.Column(scale=1):
	            output = gr.Textbox(
	                label="Generated SQL",
	                lines=10,
	                placeholder="SQL query will appear here..."
	            )

	    # Clickable examples; each row fills the three inputs listed below.
	    gr.Markdown("### Example Queries")
	    gr.Examples(
	        examples=examples,
	        inputs=[model_dropdown, instruction, schema],
	    )

	    # Footer links. A plain "|" is used as the separator: "\|" is not a valid
	    # escape sequence in a regular (non-raw) Python string literal.
	    gr.Markdown("""
	---
	Paper: [TinySQL: A Progressive Text-to-SQL Dataset for Mechanistic Interpretability Research](https://arxiv.org/abs/2503.12730)

	Resources: [GitHub](https://github.com/withmartian/TinySQL) | [Dataset](https://huggingface.co/collections/withmartian/tinysql-6760e92748b63fa56a6ffc9f)
	""")

	    # Connect button: the order of `inputs` matches generate_sql's parameters.
	    generate_btn.click(
	        fn=generate_sql,
	        inputs=[model_dropdown, instruction, schema, max_length, temperature],
	        outputs=output
	    )

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()