"""Dataset Explorer UI: browse, search, and pick rows from TinySQL datasets."""

import gradio as gr
from datasets import load_dataset
import pandas as pd

# Display name -> Hugging Face dataset repository id.
DATASETS = {
    "CS1": "withmartian/cs1_dataset",
    "CS2": "withmartian/cs2_dataset",
    "CS3": "withmartian/cs3_dataset",
    "CS2 Synonyms": "withmartian/cs2_dataset_synonyms",
    "CS3 Synonyms": "withmartian/cs3_dataset_synonyms",
    "CS4 Synonyms": "withmartian/cs4_dataset_synonyms",
}

# Columns shown in the preview table (when the dataset provides all of them).
COLUMNS = ["create_statement", "english_prompt", "sql_statement"]

# Maximum number of rows kept for the preview table.
_PREVIEW_ROWS = 500

# The HTML snippets live at module level so their text is not affected by
# the indentation of the code that renders them.
_HEADER_HTML = """

Dataset Explorer

Browse, search, and explore TinySQL datasets

"""

_QUICK_START_HTML = """

Quick Start: Select a dataset, click Load Dataset, then use search to filter. Pick any row and send it to the Model Demo tab.

"""

# Simpler dataset guide - no colors, no beginner/intermediate
_DATASET_GUIDE_HTML = """

Dataset Complexity Levels

CS1: Basic SELECT-FROM queries
CS2: Adds ORDER BY clauses
CS3: Aggregations (COUNT, SUM, AVG)
CS4: Adds WHERE filters
CS5: Multi-table JOINs
Synonym Variants
Natural language variations with semantic mappings
"""

# HuggingFace-style table with row index on hover
_TABLE_STYLE_HTML = """ """

_INVALID_ROW_MESSAGE = "Invalid row or no data loaded"
_LOAD_ERROR_MESSAGE = "Error loading dataset"


def load_preview(dataset_name: str) -> pd.DataFrame:
    """Load the first rows of a dataset's ``train`` split as a DataFrame.

    Args:
        dataset_name: A key of ``DATASETS``.

    Returns:
        A DataFrame with a leading ``index`` column followed by the dataset
        columns (restricted to ``COLUMNS`` when all of them are present).
        On any failure, a one-row DataFrame with a single ``Error`` column
        holding the exception text is returned instead of raising, so the
        UI can show the problem.
    """
    try:
        ds = load_dataset(DATASETS[dataset_name], split="train")
        df = pd.DataFrame(ds).head(_PREVIEW_ROWS)
        if all(col in df.columns for col in COLUMNS):
            df = df[COLUMNS]
        # Add index column
        df.insert(0, 'index', range(len(df)))
        return df
    except Exception as e:  # UI boundary: surface the error as table content.
        return pd.DataFrame({"Error": [str(e)]})


def filter_dataframe(df: pd.DataFrame, search_query: str) -> pd.DataFrame:
    """Return the rows of *df* in which any cell contains *search_query*.

    Matching is case-insensitive and literal: the query is NOT interpreted
    as a regular expression, so searches such as ``COUNT(`` or ``a.b`` work
    as typed instead of raising or over-matching.

    Args:
        df: The table to filter.
        search_query: Text to look for; empty/None disables filtering.

    Returns:
        *df* itself when there is nothing to filter (empty query, empty
        frame, or an error frame), otherwise the matching subset.
    """
    if not search_query or df.empty or "Error" in df.columns:
        return df

    # Column-wise matching keeps the string ops vectorized; any(axis=1)
    # then collapses the per-cell hits into one flag per row.
    hits = df.astype(str).apply(
        lambda col: col.str.contains(
            search_query, case=False, na=False, regex=False
        )
    )
    return df[hits.any(axis=1)]


def _load_and_display(dataset_name):
    """Load a dataset and return (state value, table value, status text)."""
    df = load_preview(dataset_name)
    if "Error" in df.columns:
        return df, df, _LOAD_ERROR_MESSAGE
    stats = f"**Loaded {len(df)} rows** • Columns: {', '.join(COLUMNS)}"
    return df, df, stats


def _search_and_display(df, query):
    """Filter the loaded table by *query*; return (table value, status text)."""
    if df.empty:
        return df, "Load a dataset first"
    if "Error" in df.columns:
        # The state holds a failed load; do not report it as a data row.
        return df, _LOAD_ERROR_MESSAGE
    filtered_df = filter_dataframe(df, query)
    stats = f"**Showing {len(filtered_df)} of {len(df)} rows**"
    if query:
        stats += f" • Search: '{query}'"
    return filtered_df, stats


def _send_to_model(df, row_num):
    """Pick row *row_num* of *df*; return (instruction, schema, status text).

    *row_num* is a position in the full loaded table (it matches the
    ``index`` column), not a position in the filtered view. A missing,
    negative, or out-of-range value yields empty outputs and an error
    status rather than raising.
    """
    invalid = ("", "", _INVALID_ROW_MESSAGE)
    if df.empty or "Error" in df.columns:
        return invalid
    try:
        # The Number input yields None when the field is cleared.
        row_idx = int(row_num)
    except (TypeError, ValueError):
        return invalid
    if not 0 <= row_idx < len(df):
        return invalid

    row = df.iloc[row_idx]
    instruction = row['english_prompt'] if 'english_prompt' in row else ""
    schema = row['create_statement'] if 'create_statement' in row else ""
    status = f"**Row {row_idx} loaded!** Switch to Model Demo tab"
    return instruction, schema, status


def dataset_viewer(shared_instruction, shared_schema):
    """Build the Dataset Explorer UI and wire up its events.

    Args:
        shared_instruction: Component that receives the selected row's
            ``english_prompt`` (presumably an input of the Model Demo tab —
            confirm against the caller).
        shared_schema: Component that receives the selected row's
            ``create_statement`` (same caveat).

    Returns:
        A dict with the ``df_state`` (loaded table) and ``df_display``
        (visible table) components.
    """
    gr.HTML(_HEADER_HTML)
    gr.HTML(_QUICK_START_HTML)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Controls")
            dataset_dropdown = gr.Dropdown(
                choices=list(DATASETS.keys()),
                value="CS1",
                label="Choose Dataset",
                info="Select complexity level",
            )
            gr.HTML(_DATASET_GUIDE_HTML)
            load_btn = gr.Button("Load Dataset", variant="primary", size="lg")

            gr.Markdown("### Test Example")
            row_selector = gr.Number(
                label="Row Number",
                value=0,
                minimum=0,
                precision=0,
                info="Pick a row to test",
            )
            send_to_model_btn = gr.Button(
                "Run in Model Demo", variant="primary"
            )

        with gr.Column(scale=3):
            gr.Markdown("### Dataset Preview")
            search_box = gr.Textbox(
                label="Search",
                placeholder="Search across all columns...",
                lines=1,
            )
            gr.HTML(_TABLE_STYLE_HTML)
            df_display = gr.Dataframe(
                headers=["index"] + COLUMNS,
                datatype=["number", "str", "str", "str"],
                interactive=False,
                wrap=True,
                elem_classes="dataframe-container",
            )
            stats_display = gr.Markdown(
                "Click **Load Dataset** to begin exploring"
            )

    # Holds the full loaded table so searches always filter the original
    # rows rather than an already-filtered view.
    df_state = gr.State(value=pd.DataFrame())

    load_btn.click(
        fn=_load_and_display,
        inputs=dataset_dropdown,
        outputs=[df_state, df_display, stats_display],
    )
    search_box.change(
        fn=_search_and_display,
        inputs=[df_state, search_box],
        outputs=[df_display, stats_display],
    )
    send_to_model_btn.click(
        fn=_send_to_model,
        inputs=[df_state, row_selector],
        outputs=[shared_instruction, shared_schema, stats_display],
    )

    return {'df_state': df_state, 'df_display': df_display}