Spaces:

abir-hr196
/

tinysql-demo

Sleeping

App Files Files Community

abir-hr196 committed on Oct 9

Commit

38a8f52

1 Parent(s): 5f2b44a

Initial commit

Browse files

Files changed (3) hide show

README.md +23 -7
app.py +174 -0
requirements.txt +4 -0

README.md CHANGED Viewed

@@ -1,13 +1,29 @@
 ---
-title: Tinysql Demo
-emoji: 🐠
-colorFrom: green
-colorTo: indigo
 sdk: gradio
-sdk_version: 5.49.1
 app_file: app.py
 pinned: false
-license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: TinySQL Demo
+emoji: 🔍
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 4.0.0
 app_file: app.py
 pinned: false
 ---
+# TinySQL: Text-to-SQL Generation Demo
+Generate SQL queries from natural language using models trained on the TinySQL dataset.
+## Features
+- **11 models** to choose from (33M to 1B parameters)
+- **Multiple datasets** (CS1, CS2, CS3 with base and synonym variants)
+- **Interactive interface** with example queries
+## Paper
+[TinySQL: A Progressive Text-to-SQL Dataset for Mechanistic Interpretability Research](https://arxiv.org/abs/2503.12730)
+## Resources
+- [GitHub Repository](https://github.com/withmartian/TinySQL)
+- [Dataset & Models](https://huggingface.co/collections/withmartian/tinysql-6760e92748b63fa56a6ffc9f)

app.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+# Model configurations
+# Maps the label shown in the model dropdown to the Hugging Face repo id that
+# load_model() hands to from_pretrained(). The label prefix (BM1/BM2/BM3) and
+# the size in parentheses are display text only; lookups use the full string.
+MODELS = {
+    "BM1_CS1_Syn (33M)": "withmartian/sql_interp_bm1_cs1_experiment_1.10",
+    "BM1_CS2_Syn (33M)": "withmartian/sql_interp_bm1_cs2_experiment_2.10",
+    "BM1_CS3_Syn (33M)": "withmartian/sql_interp_bm1_cs3_experiment_3.10",
+    "BM1_CS4_Syn (33M)": "withmartian/sql_interp_bm1_cs4_dataset_synonyms_experiment_1.1",
+    "BM1_CS5_Syn (33M)": "withmartian/sql_interp_bm1_cs5_dataset_synonyms_experiment_1.2",
+    "BM2_CS1_Syn (0.5B)": "withmartian/sql_interp_bm2_cs1_experiment_4.3",
+    "BM2_CS2_Syn (0.5B)": "withmartian/sql_interp_bm2_cs2_experiment_5.3",
+    "BM2_CS3_Syn (0.5B)": "withmartian/sql_interp_bm2_cs3_experiment_6.3",
+    "BM3_CS1_Syn (1B)": "withmartian/sql_interp_bm3_cs1_experiment_7.3",
+    "BM3_CS2_Syn (1B)": "withmartian/sql_interp_bm3_cs2_experiment_8.3",
+    "BM3_CS3_Syn (1B)": "withmartian/sql_interp_bm3_cs3_experiment_9.3",
+}
+# Cache loaded models
+# Keyed by dropdown label; each value is the (tokenizer, model) pair stored by
+# load_model(), so a given model is loaded at most once.
+model_cache = {}
+def load_model(model_name):
+    """Load and cache the tokenizer and model for a dropdown label.
+
+    Args:
+        model_name: A key of ``MODELS`` (the label shown in the dropdown).
+
+    Returns:
+        A ``(tokenizer, model)`` tuple. Repeated calls with the same label
+        return the cached pair instead of loading the weights again.
+
+    Raises:
+        KeyError: If ``model_name`` is not a key of ``MODELS``.
+    """
+    if model_name not in model_cache:
+        try:
+            model_id = MODELS[model_name]
+        except KeyError:
+            # Same exception type as a plain lookup, but with a message that
+            # tells the user which labels are valid.
+            raise KeyError(
+                f"Unknown model {model_name!r}; choose one of: "
+                + ", ".join(MODELS)
+            ) from None
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        # Use half precision only when a GPU is available: float16 kernels
+        # are slow or missing on CPU, so fall back to float32 there.
+        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=dtype,
+            device_map="auto",
+        )
+        model_cache[model_name] = (tokenizer, model)
+    return model_cache[model_name]
+def generate_sql(model_name, instruction, schema, max_length=256, temperature=0.7):
+    """Generate a SQL query from a natural-language instruction.
+
+    Args:
+        model_name: Dropdown label identifying the model; passed to
+            ``load_model``.
+        instruction: The natural-language request.
+        schema: The database schema text placed in the prompt's context.
+        max_length: Upper bound on the number of tokens to generate. The
+            prompt does not count against it, so a long schema cannot use up
+            the generation budget.
+        temperature: Sampling temperature. Values above 0 enable sampling;
+            0 selects greedy decoding.
+
+    Returns:
+        The generated text after the last ``### Response:`` marker, or a
+        string starting with ``Error:`` if the input is empty or anything
+        fails while loading the model or generating.
+    """
+    # Nothing to translate: skip the (potentially slow) model load entirely.
+    if not (instruction or "").strip():
+        return "Error: please enter a natural language query."
+    try:
+        tokenizer, model = load_model(model_name)
+        # Format prompt
+        prompt = f"""### Instruction: {instruction}
+### Context: {schema}
+### Response:"""
+        # Tokenize
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        # Generate
+        generation_kwargs = {
+            # The UI slider may deliver a float; generate() expects an int.
+            "max_new_tokens": int(max_length),
+            "pad_token_id": tokenizer.eos_token_id,
+        }
+        if temperature > 0:
+            generation_kwargs.update(do_sample=True, temperature=temperature)
+        else:
+            # Greedy decoding: do not pass temperature at all, since 0 is
+            # not a valid sampling temperature.
+            generation_kwargs["do_sample"] = False
+        outputs = model.generate(**inputs, **generation_kwargs)
+        # Decode
+        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Extract only the SQL response (the decoded text includes the prompt)
+        if "### Response:" in generated:
+            sql = generated.split("### Response:")[-1].strip()
+        else:
+            sql = generated.strip()
+        return sql
+    except Exception as e:
+        # UI boundary: report the failure in the output box instead of raising.
+        return f"Error: {str(e)}"
+# Example queries
+# Each row is [model label, instruction, schema]. The label must be a key of
+# MODELS, otherwise the example selects a value the dropdown does not offer
+# and load_model() cannot resolve it.
+examples = [
+    [
+        "BM1_CS1_Syn (33M)",
+        "Show me the name and salary from employees",
+        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))"
+    ],
+    [
+        "BM2_CS2_Syn (0.5B)",
+        "List worker earnings from highest to lowest",
+        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))"
+    ],
+    [
+        "BM3_CS3_Syn (1B)",
+        "Count how many employees in each department",
+        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))"
+    ],
+]
+# Create Gradio interface
+# Everything inside this block is registered on `demo`; components appear in
+# the order they are created.
+with gr.Blocks(title="TinySQL Demo") as demo:
+    gr.Markdown("""
+    # 🔍 TinySQL: Text-to-SQL Generation Demo
+    Generate SQL queries from natural language using models trained on TinySQL.
+    Select a model, provide a natural language instruction and database schema, then click **Generate**.
+    **Model Types:**
+    - **BM1** (33M params): TinyStories-based, fastest
+    - **BM2** (0.5B params): Qwen2.5-based, balanced
+    - **BM3** (1B params): Llama-3.2-based, most accurate
+    - **Syn** variants: Trained on synonym dataset (handles semantic mappings)
+    """)
+    with gr.Row():
+        # Input column: model choice, prompt fields and generation settings.
+        with gr.Column(scale=2):
+            # Choices are the MODELS labels; the default must be one of them.
+            model_dropdown = gr.Dropdown(
+                choices=list(MODELS.keys()),
+                value="BM2_CS1_Syn (0.5B)",
+                label="Select Model",
+                info="Choose model size and training dataset"
+            )
+            instruction = gr.Textbox(
+                label="Natural Language Query",
+                placeholder="e.g., Show me all employees with salary greater than 50000",
+                lines=2
+            )
+            # Pre-filled with the same employees table used by the examples.
+            schema = gr.Textbox(
+                label="Database Schema",
+                placeholder="CREATE TABLE employees (name VARCHAR, salary INT, department VARCHAR)",
+                lines=3,
+                value="CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))"
+            )
+            with gr.Row():
+                max_length = gr.Slider(
+                    minimum=64,
+                    maximum=512,
+                    value=256,
+                    step=32,
+                    label="Max Length"
+                )
+                # The slider starts at 0.1, not generate_sql's default of 0.7;
+                # the click handler always passes the slider value.
+                temperature = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.1,
+                    step=0.1,
+                    label="Temperature"
+                )
+            generate_btn = gr.Button("Generate SQL", variant="primary")
+        # Output column: shows whatever string generate_sql returns.
+        with gr.Column(scale=1):
+            output = gr.Textbox(
+                label="Generated SQL",
+                lines=10,
+                placeholder="SQL query will appear here..."
+            )
+    gr.Markdown("### Example Queries")
+    # Examples are bound to the model, instruction and schema inputs only;
+    # the two sliders are not part of an example row.
+    gr.Examples(
+        examples=examples,
+        inputs=[model_dropdown, instruction, schema],
+    )
+    gr.Markdown("""
+    ---
+    **Paper:** [TinySQL: A Progressive Text-to-SQL Dataset for Mechanistic Interpretability Research](https://arxiv.org/abs/2503.12730)
+    **Resources:** [GitHub](https://github.com/withmartian/TinySQL) | [Dataset](https://huggingface.co/collections/withmartian/tinysql-6760e92748b63fa56a6ffc9f)
+    """)
+    # Connect button
+    # The inputs list is in the same order as generate_sql's parameters.
+    generate_btn.click(
+        fn=generate_sql,
+        inputs=[model_dropdown, instruction, schema, max_length, temperature],
+        outputs=output
+    )
+# Start the app only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio>=4.0.0
+transformers>=4.30.0
+torch>=2.0.0
+accelerate>=0.20.0