Spaces:

johneze
/

chichewa-text2sql

Sleeping

App Files Files Community

johneze committed on Feb 19

Commit

aa02a35

verified ·

1 Parent(s): e06e3a1

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

app.py +112 -42
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,31 +1,93 @@
 """
 Chichewa Text-to-SQL — HuggingFace Space
-Loads johneze/Llama-3.1-8B-Instruct-chichewa-text2sql and exposes a
-Gradio API endpoint that the Streamlit app (or anyone) can call.
-Uses ZeroGPU for free GPU access on HF Spaces.
 """
 from __future__ import annotations
 import re
 import spaces
 import gradio as gr
 import torch
 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 MODEL_ID = "johneze/Llama-3.1-8B-Instruct-chichewa-text2sql"
-# Pre-download all model files to disk at startup (no GPU required).
-# When @spaces.GPU activates, from_pretrained reads from the local cache
-# instead of downloading — slashing first-call latency significantly.
 print("Downloading model weights to cache …")
 _model_cache = snapshot_download(repo_id=MODEL_ID)
 print(f"Model cached at: {_model_cache}")
-# Tokenizer is tiny — safe to load at startup without a GPU
 tokenizer = AutoTokenizer.from_pretrained(_model_cache)
-# Model is loaded lazily on the FIRST call inside @spaces.GPU where CUDA is live.
 _pipe = None
@@ -41,15 +103,12 @@ def extract_sql(text: str) -> str:
 @spaces.GPU(duration=300)
-def generate_sql(question: str, language: str = "ny") -> str:
     """
-    Generate SQL from a Chichewa or English question.
-    language: 'ny' for Chichewa, 'en' for English.
-    Returns a SQL SELECT statement.
     """
     global _pipe
     if _pipe is None:
-        # Weights already on disk — this only loads into VRAM (~30-60s)
         model = AutoModelForCausalLM.from_pretrained(
             _model_cache,
             dtype=torch.bfloat16,
@@ -58,42 +117,54 @@ def generate_sql(question: str, language: str = "ny") -> str:
         _pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
     lang_name = "Chichewa" if language == "ny" else "English"
     messages = [
         {
             "role": "system",
             "content": (
                 "You are an expert Text-to-SQL model for a SQLite database "
-                "with the following tables: production, population, food_insecurity, "
                 "commodity_prices, mse_daily. "
-                "Given a natural language question, generate ONE valid SQL SELECT query. "
-                "Return ONLY the SQL query, no explanation."
             ),
         },
-        {
-            "role": "user",
-            "content": f"Language: {lang_name}\nQuestion: {question}",
-        },
     ]
-    prompt = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    out = _pipe(
-        prompt,
-        max_new_tokens=128,
-        do_sample=False,
-        pad_token_id=tokenizer.eos_token_id,
-    )[0]["generated_text"]
-    generated = out[len(prompt):] if out.startswith(prompt) else out
-    return extract_sql(generated)
 # ── Gradio UI ──────────────────────────────────────────────────────────────
 with gr.Blocks(title="Chichewa Text-to-SQL") as demo:
-    gr.Markdown("# Chichewa Text-to-SQL\nEnter a question in Chichewa or English to generate SQL.")
     with gr.Row():
         question_box = gr.Textbox(
@@ -101,19 +172,18 @@ with gr.Blocks(title="Chichewa Text-to-SQL") as demo:
             placeholder="Ndi boma liti komwe anakolola chimanga chambiri?",
             lines=3,
         )
-        language_box = gr.Radio(
-            ["ny", "en"],
-            value="ny",
-            label="Language",
-        )
-    submit_btn = gr.Button("Generate SQL", variant="primary")
-    sql_output = gr.Code(label="Generated SQL", language="sql")
     submit_btn.click(
         fn=generate_sql,
         inputs=[question_box, language_box],
-        outputs=sql_output,
     )
     gr.Examples(

 """
 Chichewa Text-to-SQL — HuggingFace Space
+- Generates SQL from Chichewa/English questions using the fine-tuned model
+- Matches question against the training dataset (baseline retrieval)
+- Executes the SQL against the bundled SQLite database and returns results
 """
 from __future__ import annotations
+import json
 import re
+import sqlite3
+import difflib
+from pathlib import Path
 import spaces
 import gradio as gr
 import torch
+import pandas as pd
 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 MODEL_ID = "johneze/Llama-3.1-8B-Instruct-chichewa-text2sql"
+# Files uploaded alongside app.py into the Space root
+_HERE = Path(__file__).parent
+DATA_PATH = _HERE / "data" / "all.json"
+DB_PATH   = _HERE / "data" / "database" / "chichewa_text2sql.db"
+FORBIDDEN = {"insert","update","delete","drop","alter","attach","pragma","create","replace","truncate"}
+# ── Dataset ────────────────────────────────────────────────────────────────
# Examples parsed from the dataset JSON file; stays empty when the file is absent.
_examples: list = []
if not DATA_PATH.exists():
    print(f"WARNING: dataset not found at {DATA_PATH}")
else:
    _examples = json.loads(DATA_PATH.read_text(encoding="utf-8"))
    print(f"Loaded {len(_examples)} dataset examples.")
+def _norm(t: str) -> str:
+    return " ".join(t.lower().strip().split())
def find_match(question: str, language: str):
    """Find the dataset example whose question is closest to *question*.

    The question is compared against the ``question_ny`` field when
    *language* is ``"ny"`` and against ``question_en`` otherwise, after
    normalising both sides with ``_norm``.

    Returns a ``(example, score, mode)`` tuple:

    * ``(example, 1.0, "exact")`` when a normalised question is identical;
    * ``(example, ratio, "fuzzy")`` for the best ``difflib`` match with a
      similarity of at least 0.5 (ratio rounded to 3 decimals);
    * ``(None, 0.0, "none")`` when nothing matches or *question* is blank.
    """
    # A blank question would otherwise normalise to "" and "exactly" match
    # any example whose question field is missing or empty.
    if not question or not question.strip():
        return None, 0.0, "none"

    key = "question_ny" if language == "ny" else "question_en"
    q = _norm(question)

    # Keep only examples that actually carry a usable question in this
    # language; a JSON null or a missing key must not reach _norm().
    candidates = []
    for ex in _examples:
        text = ex.get(key)
        if isinstance(text, str) and text.strip():
            candidates.append((_norm(text), ex))

    for text, ex in candidates:
        if text == q:
            return ex, 1.0, "exact"

    corpus = [text for text, _ in candidates]
    hits = difflib.get_close_matches(q, corpus, n=1, cutoff=0.5)
    if not hits:
        return None, 0.0, "none"

    best = hits[0]
    example = candidates[corpus.index(best)][1]
    score = difflib.SequenceMatcher(None, q, best).ratio()
    return example, round(score, 3), "fuzzy"
+# ── SQL execution ──────────────────────────────────────────────────────────
def run_query(sql: str):
    """Validate *sql* as a single SELECT statement and run it on the database.

    Validation rejects anything that does not start with ``SELECT``, that
    contains a ``;`` after trailing semicolons are removed (this also
    rejects a semicolon inside a string literal), or that uses a word
    listed in ``FORBIDDEN``.

    Returns a ``(frame, error)`` pair: ``(DataFrame, None)`` on success (an
    empty DataFrame when the query yields no rows), or ``(None, message)``
    when validation fails, the database file is missing, or SQLite reports
    an error.
    """
    # Clean the text once so the statement that is validated is exactly the
    # statement that is executed.
    statement = (sql or "").strip().rstrip(";").strip()
    lowered = statement.lower()
    if not lowered.startswith("select"):
        return None, "Only SELECT statements are allowed."
    if ";" in statement:
        return None, "Multiple statements not allowed."

    # Match whole words only: a substring test would reject harmless
    # identifiers such as "created_at", "updated" or "replaced".
    words = set(re.findall(r"\w+", lowered))
    if not words.isdisjoint(FORBIDDEN):
        return None, "Forbidden keyword detected."

    if not DB_PATH.exists():
        return None, f"Database not found at {DB_PATH}"

    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    try:
        # Defence in depth: the SQL is model-generated, so make the
        # connection refuse any write even if validation missed something.
        conn.execute("PRAGMA query_only = ON")
        records = [dict(row) for row in conn.execute(statement).fetchall()]
    except (sqlite3.Error, sqlite3.Warning) as exc:
        return None, str(exc)
    finally:
        conn.close()

    if not records:
        return pd.DataFrame(), None
    return pd.DataFrame(records), None
+# ── Model loading ──────────────────────────────────────────────────────────
 print("Downloading model weights to cache …")
 _model_cache = snapshot_download(repo_id=MODEL_ID)
 print(f"Model cached at: {_model_cache}")
 tokenizer = AutoTokenizer.from_pretrained(_model_cache)
 _pipe = None
 @spaces.GPU(duration=300)
+def generate_sql(question: str, language: str = "ny"):
     """
+    Returns (sql: str, match_info: str, results: pd.DataFrame)
     """
     global _pipe
     if _pipe is None:
         model = AutoModelForCausalLM.from_pretrained(
             _model_cache,
             dtype=torch.bfloat16,
         _pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
     lang_name = "Chichewa" if language == "ny" else "English"
     messages = [
         {
             "role": "system",
             "content": (
                 "You are an expert Text-to-SQL model for a SQLite database "
+                "with tables: production, population, food_insecurity, "
                 "commodity_prices, mse_daily. "
+                "Generate ONE valid SQL SELECT query. Return ONLY the SQL, no explanation."
             ),
         },
+        {"role": "user", "content": f"Language: {lang_name}\nQuestion: {question}"},
     ]
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    out = _pipe(prompt, max_new_tokens=128, do_sample=False,
+                pad_token_id=tokenizer.eos_token_id)[0]["generated_text"]
+    generated = out[len(prompt):] if out.startswith(prompt) else out
+    sql = extract_sql(generated)
+    # ── Dataset match ──────────────────────────────────────────────────────
+    example, score, mode = find_match(question, language)
+    if example:
+        match_info = (
+            f"**Match:** {mode} (score: {score})\n\n"
+            f"**ny:** {example.get('question_ny', '')}\n\n"
+            f"**en:** {example.get('question_en', '')}\n\n"
+            f"**Dataset SQL:** `{example.get('sql_statement', '')}`\n\n"
+            f"**Table:** {example.get('table', '')} &nbsp;|&nbsp; "
+            f"**Difficulty:** {example.get('difficulty_level', '')}"
+        )
+    else:
+        match_info = "_No close match found in the dataset._"
+    # ── Execute SQL ────────────────────────────────────────────────────────
+    df, err = run_query(sql)
+    if err:
+        results = pd.DataFrame([{"error": err}])
+    elif df is not None and not df.empty:
+        results = df
+    else:
+        results = pd.DataFrame([{"info": "Query returned no rows."}])
+    return sql, match_info, results
 # ── Gradio UI ──────────────────────────────────────────────────────────────
 with gr.Blocks(title="Chichewa Text-to-SQL") as demo:
+    gr.Markdown("# 🌍 Chichewa Text-to-SQL\nEnter a question in Chichewa or English to generate SQL, match it against the dataset, and run it on the database.")
     with gr.Row():
         question_box = gr.Textbox(
             placeholder="Ndi boma liti komwe anakolola chimanga chambiri?",
             lines=3,
         )
+        language_box = gr.Radio(["ny", "en"], value="ny", label="Language")
+    submit_btn = gr.Button("Generate SQL & Run", variant="primary")
+    sql_output    = gr.Code(label="Generated SQL", language="sql")
+    match_output  = gr.Markdown(label="Dataset Match")
+    result_output = gr.Dataframe(label="Query Results", wrap=True)
     submit_btn.click(
         fn=generate_sql,
         inputs=[question_box, language_box],
+        outputs=[sql_output, match_output, result_output],
     )
     gr.Examples(

requirements.txt CHANGED Viewed

@@ -4,4 +4,5 @@ torch>=2.4.0
 accelerate>=0.34.0
 safetensors>=0.4.0
 spaces>=0.30.0
-bitsandbytes>=0.46.1

 accelerate>=0.34.0
 safetensors>=0.4.0
 spaces>=0.30.0
+bitsandbytes>=0.46.1
+pandas>=2.0.0