Spaces:

johneze
/

chichewa-text2sql

Sleeping

App Files Files Community

johneze committed 16 days ago

Commit

8cf79dd

verified ·

1 Parent(s): 613d990

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +33 -123

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """
 Chichewa Text-to-SQL — HuggingFace Space
 - Generates SQL from Chichewa/English questions using the fine-tuned model
-- Matches question against the training dataset (baseline retrieval)
 - Executes the SQL against the bundled SQLite database and returns results
 """
 from __future__ import annotations
@@ -21,12 +21,14 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 MODEL_ID = "johneze/Llama-3.1-8B-Instruct-chichewa-text2sql"
-# Files uploaded alongside app.py into the Space root
-_HERE = Path(__file__).parent
 DATA_PATH = _HERE / "data" / "all.json"
 DB_PATH   = _HERE / "data" / "database" / "chichewa_text2sql.db"
-FORBIDDEN = {"insert","update","delete","drop","alter","attach","pragma","create","replace","truncate"}
 # ── Dataset ────────────────────────────────────────────────────────────────
 _examples: list = []
@@ -57,9 +59,20 @@ def find_match(question: str, language: str):
     return None, 0.0, "none"
-# ── SQL execution ──────────────────────────────────────────────────────────
 def run_query(sql: str):
-    """Validate and run a SELECT query. Returns (DataFrame | None, error_str | None)."""
     s = sql.strip().rstrip(";")
     if not s.lower().startswith("select"):
         return None, "Only SELECT statements are allowed."
@@ -82,8 +95,8 @@ def run_query(sql: str):
         conn.close()
-# ── Model loading ──────────────────────────────────────────────────────────
-print("Downloading model weights to cache …")
 _model_cache = snapshot_download(repo_id=MODEL_ID)
 print(f"Model cached at: {_model_cache}")
@@ -91,22 +104,10 @@ tokenizer = AutoTokenizer.from_pretrained(_model_cache)
 _pipe = None
-def extract_sql(text: str) -> str:
-    match = re.search(r"(?is)select\s.+", text)
-    if not match:
-        return text.strip()
-    sql = match.group(0)
-    for sep in [";", "\n"]:
-        if sep in sql:
-            sql = sql.split(sep)[0]
-    return sql.strip() + ";"
 @spaces.GPU(duration=300)
 def generate_sql(question: str, language: str = "ny"):
-    """
-    Returns (sql: str, match_info: str, results: pd.DataFrame)
-    """
     global _pipe
     if _pipe is None:
         model = AutoModelForCausalLM.from_pretrained(
@@ -129,14 +130,13 @@ def generate_sql(question: str, language: str = "ny"):
         },
         {"role": "user", "content": f"Language: {lang_name}\nQuestion: {question}"},
     ]
     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     out = _pipe(prompt, max_new_tokens=128, do_sample=False,
                 pad_token_id=tokenizer.eos_token_id)[0]["generated_text"]
     generated = out[len(prompt):] if out.startswith(prompt) else out
     sql = extract_sql(generated)
-    # ── Dataset match ──────────────────────────────────────────────────────
     example, score, mode = find_match(question, language)
     if example:
         match_info = (
@@ -144,13 +144,13 @@ def generate_sql(question: str, language: str = "ny"):
             f"**ny:** {example.get('question_ny', '')}\n\n"
             f"**en:** {example.get('question_en', '')}\n\n"
             f"**Dataset SQL:** `{example.get('sql_statement', '')}`\n\n"
-            f"**Table:** {example.get('table', '')} &nbsp;|&nbsp; "
             f"**Difficulty:** {example.get('difficulty_level', '')}"
         )
     else:
         match_info = "_No close match found in the dataset._"
-    # ── Execute SQL ────────────────────────────────────────────────────────
     df, err = run_query(sql)
     if err:
         results = pd.DataFrame([{"error": err}])
@@ -164,7 +164,11 @@ def generate_sql(question: str, language: str = "ny"):
 # ── Gradio UI ──────────────────────────────────────────────────────────────
 with gr.Blocks(title="Chichewa Text-to-SQL") as demo:
-    gr.Markdown("# 🌍 Chichewa Text-to-SQL\nEnter a question in Chichewa or English to generate SQL, match it against the dataset, and run it on the database.")
     with gr.Row():
         question_box = gr.Textbox(
@@ -177,7 +181,7 @@ with gr.Blocks(title="Chichewa Text-to-SQL") as demo:
     submit_btn = gr.Button("Generate SQL & Run", variant="primary")
     sql_output    = gr.Code(label="Generated SQL", language="sql")
-    match_output  = gr.Markdown(label="Dataset Match")
     result_output = gr.Dataframe(label="Query Results", wrap=True)
     submit_btn.click(
@@ -196,98 +200,4 @@ with gr.Blocks(title="Chichewa Text-to-SQL") as demo:
         inputs=[question_box, language_box],
     )
-if __name__ == "__main__":
-    demo.launch()
-def extract_sql(text: str) -> str:
-    match = re.search(r"(?is)select\s.+", text)
-    if not match:
-        return text.strip()
-    sql = match.group(0)
-    for sep in [";", "\n"]:
-        if sep in sql:
-            sql = sql.split(sep)[0]
-    return sql.strip() + ";"
-@spaces.GPU
-def generate_sql(question: str, language: str = "ny") -> str:
-    """
-    Generate SQL from a Chichewa or English question.
-    language: 'ny' for Chichewa, 'en' for English.
-    Returns a SQL SELECT statement.
-    """
-    lang_name = "Chichewa" if language == "ny" else "English"
-    messages = [
-        {
-            "role": "system",
-            "content": (
-                "You are an expert Text-to-SQL model for a SQLite database "
-                "with the following tables: production, population, food_insecurity, "
-                "commodity_prices, mse_daily. "
-                "Given a natural language question, generate ONE valid SQL SELECT query. "
-                "Return ONLY the SQL query, no explanation."
-            ),
-        },
-        {
-            "role": "user",
-            "content": f"Language: {lang_name}\nQuestion: {question}",
-        },
-    ]
-    prompt = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    out = pipe(
-        prompt,
-        max_new_tokens=128,
-        do_sample=False,
-        pad_token_id=tokenizer.eos_token_id,
-    )[0]["generated_text"]
-    generated = out[len(prompt):] if out.startswith(prompt) else out
-    return extract_sql(generated)
-# ── Gradio UI ──────────────────────────────────────────────────────────────
-with gr.Blocks(title="Chichewa Text-to-SQL") as demo:
-    gr.Markdown("# 🌍 Chichewa Text-to-SQL\nEnter a question in Chichewa or English to generate SQL.")
-    with gr.Row():
-        question_box = gr.Textbox(
-            label="Question",
-            placeholder="Ndi boma liti komwe anakolola chimanga chambiri?",
-            lines=3,
-        )
-        language_box = gr.Radio(
-            ["ny", "en"],
-            value="ny",
-            label="Language",
-        )
-    submit_btn = gr.Button("Generate SQL", variant="primary")
-    sql_output = gr.Code(label="Generated SQL", language="sql")
-    submit_btn.click(
-        fn=generate_sql,
-        inputs=[question_box, language_box],
-        outputs=sql_output,
-    )
-    gr.Examples(
-        examples=[
-            ["Ndi boma liti komwe anakolola chimanga chambiri?", "ny"],
-            ["Which district produced the most Maize?", "en"],
-            ["Ndi anthu angati ku Lilongwe?", "ny"],
-            ["What is the food insecurity level in Nsanje?", "en"],
-        ],
-        inputs=[question_box, language_box],
-    )
-if __name__ == "__main__":
-    demo.launch()

 """
 Chichewa Text-to-SQL — HuggingFace Space
 - Generates SQL from Chichewa/English questions using the fine-tuned model
+- Matches question against the dataset (fuzzy retrieval)
 - Executes the SQL against the bundled SQLite database and returns results
 """
 from __future__ import annotations
 MODEL_ID = "johneze/Llama-3.1-8B-Instruct-chichewa-text2sql"
+_HERE     = Path(__file__).parent
 DATA_PATH = _HERE / "data" / "all.json"
 DB_PATH   = _HERE / "data" / "database" / "chichewa_text2sql.db"
+FORBIDDEN = {
+    "insert", "update", "delete", "drop", "alter",
+    "attach", "pragma", "create", "replace", "truncate",
+}
 # ── Dataset ────────────────────────────────────────────────────────────────
 _examples: list = []
     return None, 0.0, "none"
+# ── SQL helpers ────────────────────────────────────────────────────────────
+def extract_sql(text: str) -> str:
+    m = re.search(r"(?is)select\s.+", text)
+    if not m:
+        return text.strip()
+    sql = m.group(0)
+    for sep in [";", "\n"]:
+        if sep in sql:
+            sql = sql.split(sep)[0]
+    return sql.strip() + ";"
 def run_query(sql: str):
+    """Returns (DataFrame | None, error_str | None)."""
     s = sql.strip().rstrip(";")
     if not s.lower().startswith("select"):
         return None, "Only SELECT statements are allowed."
         conn.close()
+# ── Model (pre-download weights at startup, load into GPU on first call) ───
+print("Downloading model weights to cache ...")
 _model_cache = snapshot_download(repo_id=MODEL_ID)
 print(f"Model cached at: {_model_cache}")
 _pipe = None
+# ── Main function ──────────────────────────────────────────────────────────
 @spaces.GPU(duration=300)
 def generate_sql(question: str, language: str = "ny"):
+    """Returns (sql, match_info_markdown, results_dataframe)."""
     global _pipe
     if _pipe is None:
         model = AutoModelForCausalLM.from_pretrained(
         },
         {"role": "user", "content": f"Language: {lang_name}\nQuestion: {question}"},
     ]
     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     out = _pipe(prompt, max_new_tokens=128, do_sample=False,
                 pad_token_id=tokenizer.eos_token_id)[0]["generated_text"]
     generated = out[len(prompt):] if out.startswith(prompt) else out
     sql = extract_sql(generated)
+    # Dataset match
     example, score, mode = find_match(question, language)
     if example:
         match_info = (
             f"**ny:** {example.get('question_ny', '')}\n\n"
             f"**en:** {example.get('question_en', '')}\n\n"
             f"**Dataset SQL:** `{example.get('sql_statement', '')}`\n\n"
+            f"**Table:** {example.get('table', '')} | "
             f"**Difficulty:** {example.get('difficulty_level', '')}"
         )
     else:
         match_info = "_No close match found in the dataset._"
+    # Execute SQL
     df, err = run_query(sql)
     if err:
         results = pd.DataFrame([{"error": err}])
 # ── Gradio UI ──────────────────────────────────────────────────────────────
 with gr.Blocks(title="Chichewa Text-to-SQL") as demo:
+    gr.Markdown(
+        "# Chichewa Text-to-SQL\n"
+        "Enter a question in **Chichewa** or **English** to generate SQL, "
+        "match it against the dataset, and run it on the database."
+    )
     with gr.Row():
         question_box = gr.Textbox(
     submit_btn = gr.Button("Generate SQL & Run", variant="primary")
     sql_output    = gr.Code(label="Generated SQL", language="sql")
+    match_output  = gr.Markdown()
     result_output = gr.Dataframe(label="Query Results", wrap=True)
     submit_btn.click(
         inputs=[question_box, language_box],
     )
+demo.launch()