Spaces:

abir-hr196
/

tinysql-demo

Sleeping

File size: 6,121 Bytes

38a8f52

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Model configurations
MODELS = {
    "BM1_CS1_Syn (33M)": "withmartian/sql_interp_bm1_cs1_experiment_1.10",
    "BM1_CS2_Syn (33M)": "withmartian/sql_interp_bm1_cs2_experiment_2.10",
    "BM1_CS3_Syn (33M)": "withmartian/sql_interp_bm1_cs3_experiment_3.10",
    "BM1_CS4_Syn (33M)": "withmartian/sql_interp_bm1_cs4_dataset_synonyms_experiment_1.1",
    "BM1_CS5_Syn (33M)": "withmartian/sql_interp_bm1_cs5_dataset_synonyms_experiment_1.2",
    "BM2_CS1_Syn (0.5B)": "withmartian/sql_interp_bm2_cs1_experiment_4.3",
    "BM2_CS2_Syn (0.5B)": "withmartian/sql_interp_bm2_cs2_experiment_5.3",
    "BM2_CS3_Syn (0.5B)": "withmartian/sql_interp_bm2_cs3_experiment_6.3",
    "BM3_CS1_Syn (1B)": "withmartian/sql_interp_bm3_cs1_experiment_7.3",
    "BM3_CS2_Syn (1B)": "withmartian/sql_interp_bm3_cs2_experiment_8.3",
    "BM3_CS3_Syn (1B)": "withmartian/sql_interp_bm3_cs3_experiment_9.3",
}

# Cache loaded models
model_cache = {}

def load_model(model_name):
    """Load model and tokenizer with caching"""
    if model_name not in model_cache:
        model_id = MODELS[model_name]
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        model_cache[model_name] = (tokenizer, model)
    return model_cache[model_name]

def generate_sql(model_name, instruction, schema, max_length=256, temperature=0.7):
    """Generate SQL query from natural language"""
    try:
        tokenizer, model = load_model(model_name)
        
        # Format prompt
        prompt = f"""### Instruction: {instruction}
### Context: {schema}
### Response:"""
        
        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        
        # Generate
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,
            do_sample=temperature > 0,
            pad_token_id=tokenizer.eos_token_id
        )
        
        # Decode
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Extract only the SQL response
        if "### Response:" in generated:
            sql = generated.split("### Response:")[-1].strip()
        else:
            sql = generated.strip()
            
        return sql
        
    except Exception as e:
        return f"Error: {str(e)}"

# Example queries
examples = [
    [
        "BM1_CS1 (33M)",
        "Show me the name and salary from employees",
        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))"
    ],
    [
        "BM2_CS2_Syn (0.5B)",
        "List worker earnings from highest to lowest",
        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))"
    ],
    [
        "BM3_CS3 (1B)",
        "Count how many employees in each department",
        "CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))"
    ],
]

# Create Gradio interface
with gr.Blocks(title="TinySQL Demo") as demo:
    gr.Markdown("""
    # 🔍 TinySQL: Text-to-SQL Generation Demo
    
    Generate SQL queries from natural language using models trained on TinySQL.
    Select a model, provide a natural language instruction and database schema, then click **Generate**.
    
    **Model Types:**
    - **BM1** (33M params): TinyStories-based, fastest
    - **BM2** (0.5B params): Qwen2.5-based, balanced
    - **BM3** (1B params): Llama-3.2-based, most accurate
    - **Syn** variants: Trained on synonym dataset (handles semantic mappings)
    """)
    
    with gr.Row():
        with gr.Column(scale=2):
            model_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="BM2_CS1_Syn (0.5B)",
                label="Select Model",
                info="Choose model size and training dataset"
            )
            
            instruction = gr.Textbox(
                label="Natural Language Query",
                placeholder="e.g., Show me all employees with salary greater than 50000",
                lines=2
            )
            
            schema = gr.Textbox(
                label="Database Schema",
                placeholder="CREATE TABLE employees (name VARCHAR, salary INT, department VARCHAR)",
                lines=3,
                value="CREATE TABLE employees (name VARCHAR(100), salary INT, department VARCHAR(100))"
            )
            
            with gr.Row():
                max_length = gr.Slider(
                    minimum=64,
                    maximum=512,
                    value=256,
                    step=32,
                    label="Max Length"
                )
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.1,
                    step=0.1,
                    label="Temperature"
                )
            
            generate_btn = gr.Button("Generate SQL", variant="primary")
        
        with gr.Column(scale=1):
            output = gr.Textbox(
                label="Generated SQL",
                lines=10,
                placeholder="SQL query will appear here..."
            )
    
    gr.Markdown("### Example Queries")
    gr.Examples(
        examples=examples,
        inputs=[model_dropdown, instruction, schema],
    )
    
    gr.Markdown("""
    ---
    **Paper:** [TinySQL: A Progressive Text-to-SQL Dataset for Mechanistic Interpretability Research](https://arxiv.org/abs/2503.12730)
    
    **Resources:** [GitHub](https://github.com/withmartian/TinySQL) | [Dataset](https://huggingface.co/collections/withmartian/tinysql-6760e92748b63fa56a6ffc9f)
    """)
    
    # Connect button
    generate_btn.click(
        fn=generate_sql,
        inputs=[model_dropdown, instruction, schema, max_length, temperature],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()