abir-hr196 committed on
Commit
38a8f52
·
1 Parent(s): 5f2b44a

Initial commit

Browse files
Files changed (3) hide show
  1. README.md +23 -7
  2. app.py +174 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,13 +1,29 @@
1
  ---
2
- title: Tinysql Demo
3
- emoji: 🐠
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: TinySQL Demo
3
+ emoji: 🔍
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.0.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
+ # TinySQL: Text-to-SQL Generation Demo
13
+
14
+ Generate SQL queries from natural language using models trained on the TinySQL dataset.
15
+
16
+ ## Features
17
+
18
+ - **11 models** to choose from (33M to 1B parameters)
19
+ - **Multiple datasets** (CS1, CS2, CS3 with base and synonym variants)
20
+ - **Interactive interface** with example queries
21
+
22
+ ## Paper
23
+
24
+ [TinySQL: A Progressive Text-to-SQL Dataset for Mechanistic Interpretability Research](https://arxiv.org/abs/2503.12730)
25
+
26
+ ## Resources
27
+
28
+ - [GitHub Repository](https://github.com/withmartian/TinySQL)
29
+ - [Dataset & Models](https://huggingface.co/collections/withmartian/tinysql-6760e92748b63fa56a6ffc9f)
app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM
3
+ import torch
4
+
5
# Dropdown label -> Hugging Face Hub repository id.
MODELS = {
    # BM1 family (33M parameters)
    "BM1_CS1_Syn (33M)": "withmartian/sql_interp_bm1_cs1_experiment_1.10",
    "BM1_CS2_Syn (33M)": "withmartian/sql_interp_bm1_cs2_experiment_2.10",
    "BM1_CS3_Syn (33M)": "withmartian/sql_interp_bm1_cs3_experiment_3.10",
    "BM1_CS4_Syn (33M)": "withmartian/sql_interp_bm1_cs4_dataset_synonyms_experiment_1.1",
    "BM1_CS5_Syn (33M)": "withmartian/sql_interp_bm1_cs5_dataset_synonyms_experiment_1.2",
    # BM2 family (0.5B parameters)
    "BM2_CS1_Syn (0.5B)": "withmartian/sql_interp_bm2_cs1_experiment_4.3",
    "BM2_CS2_Syn (0.5B)": "withmartian/sql_interp_bm2_cs2_experiment_5.3",
    "BM2_CS3_Syn (0.5B)": "withmartian/sql_interp_bm2_cs3_experiment_6.3",
    # BM3 family (1B parameters)
    "BM3_CS1_Syn (1B)": "withmartian/sql_interp_bm3_cs1_experiment_7.3",
    "BM3_CS2_Syn (1B)": "withmartian/sql_interp_bm3_cs2_experiment_8.3",
    "BM3_CS3_Syn (1B)": "withmartian/sql_interp_bm3_cs3_experiment_9.3",
}

# Already-loaded (tokenizer, model) pairs keyed by dropdown label, so each
# checkpoint is loaded at most once per process.
model_cache = {}
22
+
23
def load_model(model_name):
    """Return the cached ``(tokenizer, model)`` pair for a model label.

    On first use the checkpoint named by ``MODELS[model_name]`` is loaded via
    ``from_pretrained`` and stored in ``model_cache``; later calls reuse it.

    Args:
        model_name: A key of ``MODELS``.

    Returns:
        Tuple ``(tokenizer, model)``.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
    """
    if model_name not in model_cache:
        model_id = MODELS[model_name]
        # Half precision only pays off on a GPU; on CPU-only hosts many
        # float16 kernels are slow or unavailable, so use float32 there.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=dtype,
            device_map="auto",
        )
        model_cache[model_name] = (tokenizer, model)
    return model_cache[model_name]
35
+
36
def generate_sql(model_name, instruction, schema, max_length=256, temperature=0.7):
    """Generate a SQL query for a natural-language instruction.

    Args:
        model_name: Label of the model to use; passed to ``load_model``.
        instruction: Natural-language request.
        schema: Database schema (e.g. ``CREATE TABLE`` statements) given to
            the model as context.
        max_length: Maximum number of tokens to generate after the prompt.
        temperature: Sampling temperature; values ``<= 0`` select greedy
            decoding.

    Returns:
        The generated text following the prompt, stripped of surrounding
        whitespace. Failures are not raised: they are returned as a string
        starting with ``"Error: "`` so the UI can display them.
    """
    # Reject blank input up front instead of loading a model for nothing.
    if not instruction or not instruction.strip():
        return "Error: please enter a natural language query."

    try:
        tokenizer, model = load_model(model_name)

        prompt = (
            f"### Instruction: {instruction}\n"
            f"### Context: {schema}\n"
            "### Response:"
        )

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_token_count = inputs["input_ids"].shape[1]

        # max_new_tokens budgets only the completion; max_length would also
        # count the prompt, so a long schema could leave no room for output.
        generation_kwargs = {
            "max_new_tokens": int(max_length),
            "do_sample": temperature > 0,
            "pad_token_id": tokenizer.eos_token_id,
        }
        # Temperature is only meaningful (and only valid) when sampling.
        if generation_kwargs["do_sample"]:
            generation_kwargs["temperature"] = float(temperature)

        outputs = model.generate(**inputs, **generation_kwargs)

        # Decode only the newly generated tokens, so the result does not
        # depend on finding the "### Response:" marker in the decoded text.
        completion_ids = outputs[0][prompt_token_count:]
        return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

    except Exception as e:
        # UI boundary: surface any failure as text in the output box.
        return f"Error: {e}"
71
+
72
+ # Example queries
73
# Rows for gr.Examples: [model label, instruction, schema].
# Every model label must be a key of MODELS, because the label is looked up
# there when the example is run.
examples = [
    [
        "BM1_CS1_Syn (33M)",
        "Show me the name and salary from employees",
        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))",
    ],
    [
        "BM2_CS2_Syn (0.5B)",
        "List worker earnings from highest to lowest",
        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))",
    ],
    [
        "BM3_CS3_Syn (1B)",
        "Count how many employees in each department",
        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))",
    ],
]
90
+
91
+ # Create Gradio interface
92
# Gradio UI: inputs (model, query, schema, generation settings) in a wide left
# column, generated SQL in a narrower right column, then examples and links.
with gr.Blocks(title="TinySQL Demo") as demo:
    gr.Markdown("""
    # 🔍 TinySQL: Text-to-SQL Generation Demo

    Generate SQL queries from natural language using models trained on TinySQL.
    Select a model, provide a natural language instruction and database schema, then click **Generate**.

    **Model Types:**
    - **BM1** (33M params): TinyStories-based, fastest
    - **BM2** (0.5B params): Qwen2.5-based, balanced
    - **BM3** (1B params): Llama-3.2-based, most accurate
    - **Syn** variants: Trained on synonym dataset (handles semantic mappings)
    """)

    with gr.Row():
        with gr.Column(scale=2):
            model_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="BM2_CS1_Syn (0.5B)",
                label="Select Model",
                info="Choose model size and training dataset",
            )

            instruction = gr.Textbox(
                label="Natural Language Query",
                placeholder="e.g., Show me all employees with salary greater than 50000",
                lines=2,
            )

            schema = gr.Textbox(
                label="Database Schema",
                placeholder="CREATE TABLE employees (name VARCHAR, salary INT, department VARCHAR)",
                lines=3,
                value="CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))",
            )

            # Generation settings side by side.
            with gr.Row():
                max_length = gr.Slider(
                    minimum=64,
                    maximum=512,
                    value=256,
                    step=32,
                    label="Max Length",
                )
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.1,
                    step=0.1,
                    label="Temperature",
                )

            generate_btn = gr.Button("Generate SQL", variant="primary")

        with gr.Column(scale=1):
            output = gr.Textbox(
                label="Generated SQL",
                lines=10,
                placeholder="SQL query will appear here...",
            )

    gr.Markdown("### Example Queries")
    # Clicking an example fills the model, query and schema inputs.
    gr.Examples(
        examples=examples,
        inputs=[model_dropdown, instruction, schema],
    )

    gr.Markdown("""
    ---
    **Paper:** [TinySQL: A Progressive Text-to-SQL Dataset for Mechanistic Interpretability Research](https://arxiv.org/abs/2503.12730)

    **Resources:** [GitHub](https://github.com/withmartian/TinySQL) | [Dataset](https://huggingface.co/collections/withmartian/tinysql-6760e92748b63fa56a6ffc9f)
    """)

    # Event listeners must be registered inside the Blocks context.
    generate_btn.click(
        fn=generate_sql,
        inputs=[model_dropdown, instruction, schema, max_length, temperature],
        outputs=output,
    )

# Start the server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ transformers>=4.30.0
3
+ torch>=2.0.0
4
+ accelerate>=0.20.0