RealMati committed on
Commit
f0e347c
·
verified ·
1 Parent(s): eec5488

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +310 -94
app.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import gradio as gr
4
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
5
  import torch
 
6
 
7
  MODEL_ID = "RealMati/t2sql_v6_structured"
8
 
@@ -13,9 +14,9 @@ model.eval()
13
  print("Model loaded.")
14
 
15
  AGG_OPS = ["", "MAX", "MIN", "COUNT", "SUM", "AVG"]
 
16
  OPS = ["=", ">", "<", ">=", "<=", "!="]
17
 
18
- # Load CSS from external file
19
  css_path = os.path.join(os.path.dirname(__file__), "style.css")
20
  with open(css_path, "r") as f:
21
  CSS = f.read()
@@ -90,7 +91,7 @@ def format_parsed(sel, agg, conds, columns):
90
  elif sel is not None:
91
  parts.append(f"Column index: {sel}")
92
  if agg is not None:
93
- agg_label = AGG_OPS[agg] if agg < len(AGG_OPS) and agg > 0 else "None"
94
  parts.append(f"Aggregation: {agg_label}")
95
  if conds:
96
  cond_strs = []
@@ -98,23 +99,38 @@ def format_parsed(sel, agg, conds, columns):
98
  c_name = columns[c_idx] if c_idx < len(columns) else f"col{c_idx}"
99
  op_str = OPS[c_op] if c_op < len(OPS) else "="
100
  cond_strs.append(f"{c_name} {op_str} {c_val}")
101
- parts.append(f"Conditions: {', '.join(cond_strs)}")
102
  else:
103
  parts.append("Conditions: None")
104
- return " | ".join(parts)
105
 
106
 
107
  def predict(question, schema, num_beams, max_length):
108
- if not question.strip():
109
- return "", "", ""
 
 
 
 
 
110
 
111
  table_name, columns = parse_schema(schema)
 
 
 
 
 
 
 
 
 
112
  input_text = f"translate to SQL: {question}"
113
  if schema.strip():
114
  input_text += f" | schema: {schema.strip()}"
115
 
116
  inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
117
 
 
118
  with torch.no_grad():
119
  outputs = model.generate(
120
  **inputs,
@@ -123,6 +139,7 @@ def predict(question, schema, num_beams, max_length):
123
  early_stopping=True,
124
  do_sample=False,
125
  )
 
126
 
127
  raw_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
128
  sel, agg, conds = decode_structured_output(raw_output)
@@ -130,10 +147,12 @@ def predict(question, schema, num_beams, max_length):
130
  if sel is not None and agg is not None and columns:
131
  sql = structured_to_sql(sel, agg, conds, columns, table_name)
132
  else:
133
- sql = "(Provide a schema to convert structured output to SQL)"
134
 
135
- parsed = format_parsed(sel, agg, conds, columns) if sel is not None else ""
136
- return sql, raw_output, parsed
 
 
137
 
138
 
139
  theme = gr.themes.Soft(
@@ -144,107 +163,304 @@ theme = gr.themes.Soft(
144
  font_mono=gr.themes.GoogleFont("Fira Code"),
145
  )
146
 
147
- with gr.Blocks(title="Text-to-SQL Demo") as demo:
148
- # Header
 
149
  gr.HTML("""
150
  <div class="main-header">
151
  <h1>Text-to-SQL</h1>
152
- <p>Fine-tuned T5 model that converts natural language questions
153
- into structured SQL queries using the WikiSQL dataset</p>
 
 
 
154
  </div>
155
  """)
156
 
157
- # Pipeline visualization - dark background so text is always visible
158
  gr.HTML("""
159
- <div class="pipeline-box">
160
- <span class="stage">Natural Language</span>
161
- <span class="arrow"> &rarr; </span>
162
- <span class="stage">T5 Encoder</span>
163
- <span class="arrow"> &rarr; </span>
164
- <span class="highlight">Structured Tokens (SEL | AGG | CONDS)</span>
165
- <span class="arrow"> &rarr; </span>
166
- <span class="stage">SQL Query</span>
167
  </div>
168
  """)
169
 
170
- with gr.Row(equal_height=True):
171
- with gr.Column(scale=1):
172
- gr.Markdown("### Input", elem_classes=["section-header"])
173
- question = gr.Textbox(
174
- label="Natural Language Question",
175
- placeholder="e.g. What is terrence ross' nationality?",
176
- lines=2,
177
- )
178
- schema = gr.Textbox(
179
- label="Database Schema",
180
- placeholder="table_name: col1, col2, col3, ...",
181
- lines=2,
182
- info="Format: table_name: column1, column2, column3",
183
- )
184
- with gr.Row():
185
- beams = gr.Slider(
186
- minimum=1, maximum=10, value=5, step=1,
187
- label="Beam Size",
188
- info="Higher = better quality, slower",
189
- )
190
- max_len = gr.Slider(
191
- minimum=64, maximum=512, value=256, step=64,
192
- label="Max Length",
193
- )
194
- btn = gr.Button("Generate SQL", variant="primary", elem_classes=["generate-btn"])
195
-
196
- with gr.Column(scale=1):
197
- gr.Markdown("### Output", elem_classes=["section-header"])
198
- sql_out = gr.Textbox(
199
- label="Generated SQL",
200
- lines=3,
201
- elem_classes=["sql-output"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  )
203
- raw_out = gr.Textbox(
204
- label="Raw Model Output (Structured Tokens)",
205
- lines=1,
206
- elem_classes=["raw-output"],
207
  )
208
- parsed_out = gr.Textbox(
209
- label="Decoded Components",
210
- lines=1,
211
- elem_classes=["raw-output"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  )
213
 
214
- btn.click(
215
- fn=predict,
216
- inputs=[question, schema, beams, max_len],
217
- outputs=[sql_out, raw_out, parsed_out],
218
- )
219
-
220
- gr.Markdown("### Try These Examples", elem_classes=["section-header"])
221
- gr.Examples(
222
- examples=[
223
- ["What is terrence ross' nationality", "players: Player, No., Nationality, Position, Years in Toronto, School/Club Team", 5, 256],
224
- ["how many schools or teams had jalen rose", "players: Player, No., Nationality, Position, Years in Toronto, School/Club Team", 5, 256],
225
- ["What was the date of the race in Misano?", "races: No, Date, Round, Circuit, Pole Position, Fastest Lap, Race winner, Report", 5, 256],
226
- ["What was the number of race that Kevin Curtain won?", "races: No, Date, Round, Circuit, Pole Position, Fastest Lap, Race winner, Report", 5, 256],
227
- ["Where was Assen held?", "races: No, Date, Round, Circuit, Pole Position, Fastest Lap, Race winner, Report", 5, 256],
228
- ["How many different positions did Sherbrooke Faucons (qmjhl) provide in the draft?", "draft: Pick, Player, Position, Nationality, NHL team, College/junior/club team", 5, 256],
229
- ["What are the nationalities of the player picked from Thunder Bay Flyers (ushl)", "draft: Pick, Player, Position, Nationality, NHL team, College/junior/club team", 5, 256],
230
- ["How many different nationalities do the players of New Jersey Devils come from?", "draft: Pick, Player, Position, Nationality, NHL team, College/junior/club team", 5, 256],
231
- ["What's Dorain Anneck's pick number?", "draft: Pick, Player, Position, Nationality, NHL team, College/junior/club team", 5, 256],
232
- ],
233
- inputs=[question, schema, beams, max_len],
234
- outputs=[sql_out, raw_out, parsed_out],
235
- fn=predict,
236
- cache_examples=False,
237
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
 
239
  gr.HTML("""
240
- <div class="footer-section">
241
- <span class="info-badge">T5-base</span>&nbsp;
242
- <span class="info-badge">WikiSQL</span>&nbsp;
243
- <span class="info-badge">Seq2Seq</span>&nbsp;
244
- <span class="info-badge">Structured Output</span>
245
- <p style="margin-top:0.75rem;">
246
- Model: <a href="https://huggingface.co/RealMati/t2sql_v6_structured" target="_blank">RealMati/t2sql_v6_structured</a>
247
- </p>
248
  </div>
249
  """)
250
 
 
3
  import gradio as gr
4
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
5
  import torch
6
+ import time
7
 
8
  MODEL_ID = "RealMati/t2sql_v6_structured"
9
 
 
14
  print("Model loaded.")
15
 
16
  AGG_OPS = ["", "MAX", "MIN", "COUNT", "SUM", "AVG"]
17
+ AGG_LABELS = ["None", "MAX", "MIN", "COUNT", "SUM", "AVG"]
18
  OPS = ["=", ">", "<", ">=", "<=", "!="]
19
 
 
20
  css_path = os.path.join(os.path.dirname(__file__), "style.css")
21
  with open(css_path, "r") as f:
22
  CSS = f.read()
 
91
  elif sel is not None:
92
  parts.append(f"Column index: {sel}")
93
  if agg is not None:
94
+ agg_label = AGG_LABELS[agg] if agg < len(AGG_LABELS) else str(agg)
95
  parts.append(f"Aggregation: {agg_label}")
96
  if conds:
97
  cond_strs = []
 
99
  c_name = columns[c_idx] if c_idx < len(columns) else f"col{c_idx}"
100
  op_str = OPS[c_op] if c_op < len(OPS) else "="
101
  cond_strs.append(f"{c_name} {op_str} {c_val}")
102
+ parts.append(f"Conditions: {' AND '.join(cond_strs)}")
103
  else:
104
  parts.append("Conditions: None")
105
+ return " | ".join(parts)
106
 
107
 
108
  def predict(question, schema, num_beams, max_length):
109
+ if not question or not question.strip():
110
+ return (
111
+ "-- Enter a question and schema above, then click Generate SQL",
112
+ "Waiting for input...",
113
+ "No query submitted yet",
114
+ "",
115
+ )
116
 
117
  table_name, columns = parse_schema(schema)
118
+
119
+ if not columns:
120
+ return (
121
+ "-- Please provide a database schema\n-- Format: table_name: col1, col2, col3",
122
+ "Cannot generate without schema",
123
+ "Schema is required to map column indices",
124
+ "",
125
+ )
126
+
127
  input_text = f"translate to SQL: {question}"
128
  if schema.strip():
129
  input_text += f" | schema: {schema.strip()}"
130
 
131
  inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
132
 
133
+ t0 = time.time()
134
  with torch.no_grad():
135
  outputs = model.generate(
136
  **inputs,
 
139
  early_stopping=True,
140
  do_sample=False,
141
  )
142
+ latency = time.time() - t0
143
 
144
  raw_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
145
  sel, agg, conds = decode_structured_output(raw_output)
 
147
  if sel is not None and agg is not None and columns:
148
  sql = structured_to_sql(sel, agg, conds, columns, table_name)
149
  else:
150
+ sql = f"-- Could not parse model output\n-- Raw: {raw_output}"
151
 
152
+ parsed = format_parsed(sel, agg, conds, columns) if sel is not None else "Parse failed"
153
+ latency_str = f"Inference: {latency:.2f}s | Beams: {int(num_beams)} | Input tokens: {inputs['input_ids'].shape[1]}"
154
+
155
+ return sql, raw_output, parsed, latency_str
156
 
157
 
158
  theme = gr.themes.Soft(
 
163
  font_mono=gr.themes.GoogleFont("Fira Code"),
164
  )
165
 
166
+ with gr.Blocks(title="Text-to-SQL | T5 Fine-tuned on WikiSQL") as demo:
167
+
168
+ # ── Header ──
169
  gr.HTML("""
170
  <div class="main-header">
171
  <h1>Text-to-SQL</h1>
172
+ <p class="tagline">A fine-tuned T5 encoder-decoder that translates natural language
173
+ into structured SQL via learned column &amp; operator indices</p>
174
+ <a class="model-link" href="https://huggingface.co/RealMati/t2sql_v6_structured" target="_blank">
175
+ View Model on HuggingFace
176
+ </a>
177
  </div>
178
  """)
179
 
180
+ # ── Tech Badges ──
181
  gr.HTML("""
182
+ <div class="tech-badges">
183
+ <span class="badge badge-indigo">T5-base (220M params)</span>
184
+ <span class="badge badge-purple">Seq2Seq</span>
185
+ <span class="badge badge-emerald">WikiSQL (80K+ examples)</span>
186
+ <span class="badge badge-amber">Structured Output</span>
 
 
 
187
  </div>
188
  """)
189
 
190
+ # ── Pipeline Strip ──
191
+ gr.HTML("""
192
+ <div class="pipeline-strip">
193
+ <span class="step step-input">Natural Language</span>
194
+ <span class="arrow">&rarr;</span>
195
+ <span class="step step-model">T5 Encoder-Decoder</span>
196
+ <span class="arrow">&rarr;</span>
197
+ <span class="step step-struct">SEL | AGG | CONDS</span>
198
+ <span class="arrow">&rarr;</span>
199
+ <span class="step step-sql">Executable SQL</span>
200
+ </div>
201
+ """)
202
+
203
+ # ── Tabs ──
204
+ with gr.Tabs():
205
+
206
+ # ═══════════ TAB 1: DEMO ═══════════
207
+ with gr.Tab("Demo"):
208
+ with gr.Row(equal_height=False):
209
+ # Left column — inputs
210
+ with gr.Column(scale=1):
211
+ gr.Markdown("#### Query Input")
212
+ question = gr.Textbox(
213
+ label="Natural Language Question",
214
+ placeholder="e.g. What is terrence ross' nationality?",
215
+ lines=2,
216
+ )
217
+ schema = gr.Textbox(
218
+ label="Database Schema",
219
+ placeholder="table_name: col1, col2, col3, ...",
220
+ lines=2,
221
+ )
222
+ gr.HTML('<p class="input-hint">Format: <code>table_name: column1, column2, column3</code> &mdash; column order matters (maps to indices)</p>')
223
+
224
+ with gr.Row():
225
+ beams = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Beam Size")
226
+ max_len = gr.Slider(minimum=64, maximum=512, value=256, step=64, label="Max Length")
227
+
228
+ btn = gr.Button("Generate SQL", variant="primary", elem_classes=["generate-btn"], size="lg")
229
+
230
+ # Right column — outputs
231
+ with gr.Column(scale=1):
232
+ gr.Markdown("#### Model Output")
233
+ sql_out = gr.Textbox(
234
+ label="Generated SQL",
235
+ value="-- Enter a question and schema above, then click Generate SQL",
236
+ lines=3,
237
+ elem_classes=["sql-output"],
238
+ )
239
+ with gr.Row():
240
+ raw_out = gr.Textbox(
241
+ label="Raw Structured Tokens",
242
+ value="Waiting for input...",
243
+ lines=1,
244
+ elem_classes=["decode-box"],
245
+ )
246
+ parsed_out = gr.Textbox(
247
+ label="Decoded Mapping",
248
+ value="No query submitted yet",
249
+ lines=1,
250
+ elem_classes=["decode-box"],
251
+ )
252
+ latency_out = gr.Textbox(
253
+ label="Performance",
254
+ value="",
255
+ lines=1,
256
+ elem_classes=["decode-box"],
257
+ )
258
+
259
+ btn.click(
260
+ fn=predict,
261
+ inputs=[question, schema, beams, max_len],
262
+ outputs=[sql_out, raw_out, parsed_out, latency_out],
263
  )
264
+ question.submit(
265
+ fn=predict,
266
+ inputs=[question, schema, beams, max_len],
267
+ outputs=[sql_out, raw_out, parsed_out, latency_out],
268
  )
269
+
270
+ # ── Examples ──
271
+ gr.Markdown("#### Example Queries")
272
+ gr.Examples(
273
+ examples=[
274
+ ["What is terrence ross' nationality", "players: Player, No., Nationality, Position, Years in Toronto, School/Club Team", 5, 256],
275
+ ["how many schools or teams had jalen rose", "players: Player, No., Nationality, Position, Years in Toronto, School/Club Team", 5, 256],
276
+ ["What was the date of the race in Misano?", "races: No, Date, Round, Circuit, Pole Position, Fastest Lap, Race winner, Report", 5, 256],
277
+ ["What was the number of race that Kevin Curtain won?", "races: No, Date, Round, Circuit, Pole Position, Fastest Lap, Race winner, Report", 5, 256],
278
+ ["Where was Assen held?", "races: No, Date, Round, Circuit, Pole Position, Fastest Lap, Race winner, Report", 5, 256],
279
+ ["How many different positions did Sherbrooke Faucons (qmjhl) provide in the draft?", "draft: Pick, Player, Position, Nationality, NHL team, College/junior/club team", 5, 256],
280
+ ["What are the nationalities of the player picked from Thunder Bay Flyers (ushl)", "draft: Pick, Player, Position, Nationality, NHL team, College/junior/club team", 5, 256],
281
+ ["How many different nationalities do the players of New Jersey Devils come from?", "draft: Pick, Player, Position, Nationality, NHL team, College/junior/club team", 5, 256],
282
+ ["What's Dorain Anneck's pick number?", "draft: Pick, Player, Position, Nationality, NHL team, College/junior/club team", 5, 256],
283
+ ],
284
+ inputs=[question, schema, beams, max_len],
285
+ outputs=[sql_out, raw_out, parsed_out, latency_out],
286
+ fn=predict,
287
+ cache_examples=False,
288
  )
289
 
290
+ # ═══════════ TAB 2: HOW IT WORKS ═══════════
291
+ with gr.Tab("How It Works"):
292
+ gr.HTML("""
293
+ <div class="arch-section">
294
+
295
+ <div class="arch-card">
296
+ <h3>Architecture Overview</h3>
297
+ <p>This system uses a <strong>T5-base</strong> (Text-to-Text Transfer Transformer) model
298
+ fine-tuned on the <strong>WikiSQL</strong> dataset. Instead of generating raw SQL strings directly,
299
+ the model outputs <em>structured tokens</em> that encode the query as column indices and operator codes.
300
+ A deterministic decoder then maps these indices back to actual column names using the provided schema.</p>
301
+ </div>
302
+
303
+ <div class="arch-grid">
304
+ <div class="arch-card">
305
+ <h3>1. Input Encoding</h3>
306
+ <p>The natural language question and database schema are concatenated into a single input string:</p>
307
+ <p><code>translate to SQL: {question} | schema: {table}: {col1}, {col2}, ...</code></p>
308
+ <p>The schema provides the column vocabulary. Column order is critical &mdash;
309
+ the model references columns by their <strong>positional index</strong> (0-based).</p>
310
+ </div>
311
+
312
+ <div class="arch-card">
313
+ <h3>2. T5 Generation</h3>
314
+ <p>The encoder processes the full input sequence. The decoder then generates structured tokens
315
+ using beam search (default: 5 beams) with early stopping.</p>
316
+ <p>Output format: <code>SEL:{col_idx} | AGG:{agg_idx} | CONDS:{col},{op},{val};...</code></p>
317
+ </div>
318
+
319
+ <div class="arch-card">
320
+ <h3>3. Structured Decoding</h3>
321
+ <p>The raw token string is parsed into three components:</p>
322
+ <ul style="margin:0.5rem 0; padding-left:1.2rem;">
323
+ <li><strong>SEL</strong> &mdash; which column to SELECT (index into schema)</li>
324
+ <li><strong>AGG</strong> &mdash; aggregation function (0=none, 1=MAX, 2=MIN, 3=COUNT, 4=SUM, 5=AVG)</li>
325
+ <li><strong>CONDS</strong> &mdash; WHERE conditions as <code>col_idx,op_idx,value</code> tuples</li>
326
+ </ul>
327
+ </div>
328
+
329
+ <div class="arch-card">
330
+ <h3>4. SQL Assembly</h3>
331
+ <p>Column indices are mapped back to actual column names from the schema. Operator indices
332
+ are converted to SQL operators (=, >, <, >=, <=, !=). The components are assembled into
333
+ a valid SQL query with proper quoting and escaping.</p>
334
+ </div>
335
+ </div>
336
+
337
+ <div class="arch-card">
338
+ <h3>Why Structured Output?</h3>
339
+ <p>Generating SQL as structured indices rather than free-form text provides several advantages:</p>
340
+ <ul style="margin:0.5rem 0; padding-left:1.2rem;">
341
+ <li><strong>Schema-agnostic</strong> &mdash; The model learns query patterns, not specific column names.
342
+ It generalizes across any table schema.</li>
343
+ <li><strong>Syntactically valid</strong> &mdash; The deterministic decoder guarantees well-formed SQL.
344
+ No risk of misspelled keywords or broken syntax.</li>
345
+ <li><strong>Smaller output space</strong> &mdash; The model only needs to predict a few integers and condition values,
346
+ reducing the search space and improving accuracy.</li>
347
+ <li><strong>Interpretable</strong> &mdash; Each component (SEL, AGG, CONDS) can be inspected independently,
348
+ making debugging and analysis straightforward.</li>
349
+ </ul>
350
+ </div>
351
+
352
+ <div class="arch-card">
353
+ <h3>Encoding Reference</h3>
354
+ <table class="encoding-table">
355
+ <tr>
356
+ <th>Component</th>
357
+ <th>Index</th>
358
+ <th>Meaning</th>
359
+ </tr>
360
+ <tr><td rowspan="6"><strong>AGG</strong></td>
361
+ <td class="mono">0</td><td>No aggregation (plain SELECT)</td></tr>
362
+ <tr><td class="mono">1</td><td>MAX</td></tr>
363
+ <tr><td class="mono">2</td><td>MIN</td></tr>
364
+ <tr><td class="mono">3</td><td>COUNT</td></tr>
365
+ <tr><td class="mono">4</td><td>SUM</td></tr>
366
+ <tr><td class="mono">5</td><td>AVG</td></tr>
367
+ <tr><td rowspan="6"><strong>OP</strong> (in CONDS)</td>
368
+ <td class="mono">0</td><td>= (equals)</td></tr>
369
+ <tr><td class="mono">1</td><td>> (greater than)</td></tr>
370
+ <tr><td class="mono">2</td><td>< (less than)</td></tr>
371
+ <tr><td class="mono">3</td><td>>= (greater or equal)</td></tr>
372
+ <tr><td class="mono">4</td><td><= (less or equal)</td></tr>
373
+ <tr><td class="mono">5</td><td>!= (not equal)</td></tr>
374
+ </table>
375
+ </div>
376
+ </div>
377
+ """)
378
+
379
+ # ═══════════ TAB 3: MODEL INFO ═══════════
380
+ with gr.Tab("Model & Training"):
381
+ gr.HTML("""
382
+ <div class="arch-section">
383
+
384
+ <div class="stats-grid">
385
+ <div class="stat-card">
386
+ <div class="stat-value">220M</div>
387
+ <div class="stat-label">Parameters</div>
388
+ </div>
389
+ <div class="stat-card">
390
+ <div class="stat-value">80K+</div>
391
+ <div class="stat-label">Training Examples</div>
392
+ </div>
393
+ <div class="stat-card">
394
+ <div class="stat-value">T5-base</div>
395
+ <div class="stat-label">Architecture</div>
396
+ </div>
397
+ <div class="stat-card">
398
+ <div class="stat-value">WikiSQL</div>
399
+ <div class="stat-label">Dataset</div>
400
+ </div>
401
+ </div>
402
+
403
+ <div class="arch-grid">
404
+ <div class="arch-card">
405
+ <h3>Model Architecture</h3>
406
+ <ul style="margin:0.5rem 0; padding-left:1.2rem;">
407
+ <li><strong>Base model:</strong> T5-base (encoder-decoder transformer)</li>
408
+ <li><strong>Tokenizer:</strong> SentencePiece (32K vocabulary)</li>
409
+ <li><strong>Max input length:</strong> 512 tokens</li>
410
+ <li><strong>Max output length:</strong> 256 tokens</li>
411
+ <li><strong>Decoding:</strong> Beam search (default 5 beams)</li>
412
+ <li><strong>Framework:</strong> HuggingFace Transformers + PyTorch</li>
413
+ </ul>
414
+ </div>
415
+
416
+ <div class="arch-card">
417
+ <h3>Training Details</h3>
418
+ <ul style="margin:0.5rem 0; padding-left:1.2rem;">
419
+ <li><strong>Dataset:</strong> WikiSQL (Zhong et al., 2017)</li>
420
+ <li><strong>Train split:</strong> ~56,355 examples</li>
421
+ <li><strong>Dev split:</strong> ~8,421 examples</li>
422
+ <li><strong>Test split:</strong> ~15,878 examples</li>
423
+ <li><strong>Output format:</strong> Structured tokens (SEL/AGG/CONDS)</li>
424
+ <li><strong>Task prefix:</strong> <code>translate to SQL:</code></li>
425
+ </ul>
426
+ </div>
427
+
428
+ <div class="arch-card">
429
+ <h3>Dataset: WikiSQL</h3>
430
+ <p>WikiSQL is a large-scale dataset of 80,654 hand-annotated SQL queries and natural language
431
+ questions corresponding to 24,241 tables from Wikipedia. Each query operates on a single table
432
+ and supports SELECT, aggregation (COUNT, SUM, MAX, MIN, AVG), and WHERE conditions
433
+ with comparison operators.</p>
434
+ <p style="margin-top:0.5rem;">
435
+ <a href="https://github.com/salesforce/WikiSQL" target="_blank" style="color:#667eea;">
436
+ github.com/salesforce/WikiSQL
437
+ </a>
438
+ </p>
439
+ </div>
440
+
441
+ <div class="arch-card">
442
+ <h3>Limitations</h3>
443
+ <ul style="margin:0.5rem 0; padding-left:1.2rem;">
444
+ <li><strong>Single-table only</strong> &mdash; No JOINs or subqueries (WikiSQL constraint)</li>
445
+ <li><strong>Fixed operators</strong> &mdash; Limited to =, >, <, >=, <=, != </li>
446
+ <li><strong>No GROUP BY / ORDER BY</strong> &mdash; Not in the WikiSQL schema</li>
447
+ <li><strong>AND-only conditions</strong> &mdash; Multiple conditions are joined with AND</li>
448
+ <li><strong>Schema required</strong> &mdash; Column names and order must be provided</li>
449
+ </ul>
450
+ </div>
451
+ </div>
452
+ </div>
453
+ """)
454
 
455
+ # ── Footer ──
456
  gr.HTML("""
457
+ <div class="app-footer">
458
+ Built with <a href="https://huggingface.co/docs/transformers" target="_blank">Transformers</a>
459
+ &amp; <a href="https://gradio.app" target="_blank">Gradio</a>
460
+ &nbsp;&bull;&nbsp;
461
+ Model: <a href="https://huggingface.co/RealMati/t2sql_v6_structured" target="_blank">RealMati/t2sql_v6_structured</a>
462
+ &nbsp;&bull;&nbsp;
463
+ Dataset: <a href="https://github.com/salesforce/WikiSQL" target="_blank">WikiSQL</a>
 
464
  </div>
465
  """)
466