Spaces:

hmm404
/

WASH_NL2SQL

Sleeping

App Files Files Community

hmm404 committed on Oct 22, 2025

Commit

530a106

verified ·

1 Parent(s): 7a7a0e6

Upload 3 files

Browse files

Files changed (3) hide show

app.py +242 -0
requirements.txt +3 -0
schema.py +297 -0

app.py ADDED Viewed

	@@ -0,0 +1,242 @@

+import os
+import re
+import sqlite3
+import warnings
+import gradio as gr
+import pandas as pd
+from schema import schema
+from langchain_nvidia_ai_endpoints import ChatNVIDIA
+warnings.filterwarnings("ignore")
+# SECURITY: the NVIDIA API key must never be committed to the repository. It is
+# read from the NVIDIA_API_KEY environment variable instead (on Hugging Face
+# Spaces, configure it as a Space secret). Any key that was previously
+# hard-coded here is exposed in the commit history and must be revoked.
+API_KEY = os.environ.get("NVIDIA_API_KEY")
+# Path of the SQLite database file queried by execute_sql_query.
+db_path = "wash_db.db"
+# Shared chat-model client used for both SQL generation and result summaries.
+client = ChatNVIDIA(
+    model="deepseek-ai/deepseek-r1",
+    api_key=API_KEY,
+    temperature=0.1,
+    top_p=1,
+    max_tokens=4096,
+)
+def get_table_names(schema: str):
+    """Return the names of all tables declared in *schema*, in order of appearance.
+
+    A table name is the identifier that follows the literal keyword ``TABLE ``.
+    """
+    table_pattern = re.compile(r'TABLE (\w+)')
+    return [hit.group(1) for hit in table_pattern.finditer(schema)]
+def get_table_columns(schema: str, table: str):
+    """Return the column names declared for *table* in *schema*.
+
+    The table body is located by its ``TABLE <name> (`` header and read up to
+    the matching closing parenthesis, so nested parentheses such as
+    ``FOREIGN KEY (col) REFERENCES other(col)`` do not cut the body short.
+    Each comma-separated entry contributes its first word, except table-level
+    constraints (``PRIMARY KEY (...)``, ``FOREIGN KEY (...)`` and similar).
+
+    Args:
+        schema: Schema text containing ``TABLE name ( ... )`` declarations.
+        table: Name of the table to look up; matched literally.
+
+    Returns:
+        Column names in declaration order, or an empty list when the table is
+        not found.
+    """
+    constraint_keywords = {"PRIMARY", "FOREIGN", "UNIQUE", "CHECK", "CONSTRAINT"}
+    # re.escape keeps a table name containing regex metacharacters literal.
+    header = re.search(rf'TABLE {re.escape(table)}\s*\(', schema)
+    if not header:
+        return []
+    # Walk the body character by character, tracking parenthesis depth so that
+    # only commas at the top level of the table body separate entries.
+    entries, current, depth = [], [], 1
+    for char in schema[header.end():]:
+        if char == "(":
+            depth += 1
+        elif char == ")":
+            depth -= 1
+            if depth == 0:
+                break
+        if char == "," and depth == 1:
+            entries.append("".join(current))
+            current = []
+        else:
+            current.append(char)
+    entries.append("".join(current))
+    columns = []
+    for entry in entries:
+        words = entry.split()
+        # The first word of a column definition is its name; type names and
+        # modifiers (INTEGER, TEXT, NOT NULL, ...) follow it and are ignored.
+        if words and words[0].upper() not in constraint_keywords:
+            columns.append(words[0])
+    return columns
+def agent_select_table(user_query, schema):
+    """Pick the table whose name best matches a word of *user_query*.
+
+    Every whitespace-separated, lower-cased query word is tested for substring
+    containment in each lower-cased table name. The table matched by the
+    longest word wins, and the earliest such match is kept on ties. When no
+    word matches any table name, the first table of the schema is returned.
+    """
+    table_names = get_table_names(schema)
+    matches = (
+        (len(token), name)
+        for name in table_names
+        for token in user_query.lower().split()
+        if token in name.lower()
+    )
+    # max() returns the first maximal element, preserving the tie-breaking
+    # order of "first table, then first word" among equally long matches.
+    winner = max(matches, key=lambda pair: pair[0], default=None)
+    if winner is not None:
+        return winner[1]
+    # Fallback: first table declared in the schema.
+    return table_names[0]
+def agent_select_columns(user_query, table, schema):
+    """Return the columns of *table* that share a substring with the query.
+
+    A column is kept when any lower-cased, whitespace-separated word of
+    *user_query* occurs inside the lower-cased column name. If no column
+    qualifies, every column of the table is returned instead.
+    """
+    all_columns = get_table_columns(schema, table)
+    matched = [
+        name
+        for name in all_columns
+        if any(token in name.lower() for token in user_query.lower().split())
+    ]
+    # An empty match list falls back to the complete column list.
+    return matched or all_columns
+def build_sql_prompt(table, columns, schema, user_question, error_reason=None):
+    """Assemble the LLM prompt that asks for a SQLite query.
+
+    The prompt contains the full *schema* and the *user_question*. *table* and
+    *columns* are accepted but not inserted into the prompt text. When
+    *error_reason* is truthy, a trailing section reports the previous failure
+    and asks the model to regenerate the SQL.
+    """
+    sections = [
+        "You are an expert SQL assistant.\n",
+        f"Schema: {schema}\n",
+        f"User question: {user_question}\n",
+        "Write a valid SQLite SQL query answering the question using only the given table and columns.\n",
+    ]
+    if error_reason:
+        sections.append(
+            f"The previous SQL query failed with the error: {error_reason}\nPlease fix and regenerate the SQL only."
+        )
+    return "".join(sections)
+def extract_sql_query(text):
+    """Extract the SQL statement from an LLM response.
+
+    Strategies are tried in order:
+      1. the contents of a fenced code block (``sql``/``sqlite`` tagged first,
+         then any fence);
+      2. the first span that starts with a SQL statement keyword and ends at
+         the next semicolon;
+      3. all lines that mention a common SQL keyword, joined by spaces;
+      4. the stripped text itself.
+
+    Args:
+        text: Raw model output.
+
+    Returns:
+        The best-effort SQL string, stripped of surrounding whitespace.
+    """
+    flags = re.DOTALL | re.IGNORECASE
+    # Drop <think>...</think> reasoning sections first, so that SQL keywords
+    # mentioned while "thinking" are not mistaken for the final answer.
+    # NOTE(review): assumes the configured reasoning model emits its reasoning
+    # inline in this form - confirm against actual responses.
+    text = re.sub(r"<think>.*?</think>", "", text, flags=flags)
+    fence_patterns = [
+        # \b\s* accepts a language tag followed by a newline or a plain space,
+        # so the tag itself never leaks into the extracted query.
+        r"```(?:sqlite|sql)\b\s*(.*?)```",
+        r"```\s*(.*?)```",
+    ]
+    for pattern in fence_patterns:
+        match = re.search(pattern, text, flags)
+        if match:
+            return match.group(1).strip()
+    # No fence: look for "<statement keyword> ... ;". Word boundaries keep
+    # ordinary words such as "updated" or "created" from starting a match.
+    match = re.search(
+        r"\b(?:SELECT|INSERT|UPDATE|DELETE|CREATE|DROP|ALTER)\b.*?;", text, flags
+    )
+    if match:
+        return match.group(0).strip()
+    keywords = ('SELECT', 'FROM', 'WHERE', 'INSERT', 'UPDATE', 'DELETE')
+    sql_lines = [
+        line for line in text.split('\n')
+        if any(keyword in line.upper() for keyword in keywords)
+    ]
+    if sql_lines:
+        return ' '.join(sql_lines)
+    return text.strip()
+def execute_sql_query(sql_query, db_path=db_path):
+    """Run *sql_query* against the SQLite database at *db_path*.
+
+    Args:
+        sql_query: SQL text to execute.
+        db_path: Path of the SQLite database file.
+
+    Returns:
+        ``(dataframe, None)`` on success, or ``(None, error_message)`` when
+        connecting or executing raises.
+    """
+    # NOTE(review): sql_query is LLM-generated from free-form user input and is
+    # executed as-is on a writable connection; consider a read-only connection
+    # or a SELECT-only check before running it.
+    conn = None
+    try:
+        conn = sqlite3.connect(db_path)
+        frame = pd.read_sql_query(sql_query, conn)
+        return frame, None
+    except Exception as e:
+        # Deliberate catch-all: the message is fed back to the LLM retry loop.
+        return None, str(e)
+    finally:
+        # Close on every path; previously a failing query leaked the connection.
+        if conn is not None:
+            conn.close()
+def summarize_with_llm(table, columns, data, user_query):
+    """Ask the LLM for a short summary of a query result.
+
+    Args:
+        table: Unused; kept for interface compatibility.
+        columns: Unused; kept for interface compatibility.
+        data: Result DataFrame, or None. Only the first five rows are sent.
+        user_query: The user's original natural-language question.
+
+    Returns:
+        The ``content`` attribute of the model response when present,
+        otherwise ``str(response)``.
+    """
+    if data is None or data.empty:
+        preview = "No data returned."
+    else:
+        head = data.head(5)
+        try:
+            preview = head.to_markdown(index=False)
+        except ImportError:
+            # DataFrame.to_markdown needs the optional "tabulate" package,
+            # which requirements.txt does not list; fall back to plain text
+            # instead of failing the whole request.
+            preview = head.to_string(index=False)
+    prompt = (
+        f"User query: {user_query}\n"
+        f"SQL result preview \n{preview}\n"
+        "Summarize the result, referencing the user query and the preview."
+    )
+    resp = client.invoke([{"role": "user", "content": prompt}])
+    return resp.content if hasattr(resp, "content") else str(resp)
+# def full_pipeline(user_question):
+#     table = agent_select_table(user_question, schema)
+#     columns = agent_select_columns(user_question, table, schema)
+# yield {
+#     table_output: gr.update(value=table),
+#     columns_output: gr.update(value=", ".join(columns)),
+# }
+#     sql_prompt = build_sql_prompt(table, columns, user_question)
+#     sql_query, error = "", None
+#     # Error-handling and retry loop
+#     for _ in range(5):
+#         llm_resp = client.invoke([{"role": "user", "content": sql_prompt}])
+#         llm_text = getattr(llm_resp, "content", llm_resp) if hasattr(llm_resp, "content") else str(llm_resp)
+#         sql_query = extract_sql_query(llm_text)
+#         results_df, error = execute_sql_query(sql_query)
+#         if not error:
+#             break
+#         sql_prompt = build_sql_prompt(table, columns, user_question, error_reason=error)
+#     # Summarize
+#     summary = summarize_with_llm(table, columns, results_df, user_question)
+#     # Format outputs
+#     columns_view = ", ".join(columns)
+#     sql_view = f"```sql\n{sql_query}\n```"
+#     status_view = f"Success" if not error else f"Query error: {error}"
+#     out_df = results_df if results_df is not None else pd.DataFrame()
+#     return sql_view, status_view, summary, table, columns_view, out_df
+def full_pipeline_stream(user_question):
+    """Run the NL-to-SQL pipeline, yielding a progress tuple after each step.
+
+    Every yielded value is a 6-tuple
+    ``(status, sql, summary, table, columns, dataframe)``.
+    NOTE(review): this order differs from the ``outputs`` list wired to the
+    submit button (sql first, then status) - reorder before binding this
+    generator to the UI.
+
+    Args:
+        user_question: Natural-language question entered by the user.
+    """
+    yield "Identifying relevant table and columns...", "", "", "", "", pd.DataFrame()
+    table = agent_select_table(user_question, schema)
+    columns = agent_select_columns(user_question, table, schema)
+    columns_view = ", ".join(columns)
+    yield f"Table '{table}' selected.", "", "", table, columns_view, pd.DataFrame()
+    # build_sql_prompt takes (table, columns, schema, user_question); the schema
+    # argument was previously omitted, which raised TypeError on the first call.
+    sql_prompt = build_sql_prompt(table, columns, schema, user_question)
+    sql_view, error, results_df = "", None, None
+    for attempt in range(1, 6):
+        yield f"Generating SQL (attempt {attempt})...", "", "", table, columns_view, pd.DataFrame()
+        llm_resp = client.invoke([{"role": "user", "content": sql_prompt}])
+        llm_text = llm_resp.content if hasattr(llm_resp, "content") else str(llm_resp)
+        sql_query = extract_sql_query(llm_text)
+        # Show the generated statement in the SQL slot (it used to be an empty
+        # pair of code fences).
+        sql_view = f"```sql\n{sql_query}\n```"
+        results_df, error = execute_sql_query(sql_query)
+        if not error:
+            yield "SQL executed successfully.", sql_view, "", table, columns_view, results_df
+            break
+        # Feed the database error back so the model can repair its query.
+        sql_prompt = build_sql_prompt(table, columns, schema, user_question, error_reason=error)
+        yield f"Retrying due to error: {error}", sql_view, "", table, columns_view, pd.DataFrame()
+    if not error:
+        summary = summarize_with_llm(table, columns, results_df, user_question)
+        yield "Summarization complete.", sql_view, summary, table, columns_view, results_df
+    else:
+        yield f"Final error: {error}", sql_view, "No summary due to error.", table, columns_view, pd.DataFrame()
+def full_pipeline(user_question):
+    """Run the NL-to-SQL pipeline for the Gradio UI, updating it in stages.
+
+    Yields, in order:
+      1. a 6-tuple of empty values that clears all six output components;
+      2. a dict updating the table and columns boxes;
+      3. a dict updating the SQL, status and results components;
+      4. a dict updating the summary box.
+
+    SQL generation is attempted up to five times; after a failed execution the
+    database error is passed back to the model in the next prompt.
+
+    Args:
+        user_question: Natural-language question entered by the user.
+    """
+    # Step 1: clear previous outputs, then identify table and columns.
+    yield "", "", "", "", "", pd.DataFrame()
+    table = agent_select_table(user_question, schema)
+    columns = agent_select_columns(user_question, table, schema)
+    yield {
+        table_output: gr.update(value=table),
+        columns_output: gr.update(value=", ".join(columns)),
+    }
+    # Step 2: generate SQL, retrying with the error message on failure.
+    sql_prompt = build_sql_prompt(table, columns, schema, user_question)
+    sql_query, error, results_df = "", None, None
+    for attempt in range(5):
+        llm_resp = client.invoke([{"role": "user", "content": sql_prompt}])
+        llm_text = llm_resp.content if hasattr(llm_resp, "content") else str(llm_resp)
+        sql_query = extract_sql_query(llm_text)
+        results_df, error = execute_sql_query(sql_query)
+        if not error:
+            break
+        sql_prompt = build_sql_prompt(table, columns, schema, user_question, error_reason=error)
+    sql_view = f"\n{sql_query.strip()}\n"
+    status_view = "Success" if not error else f"Query error: {error}"
+    out_df = results_df if results_df is not None else pd.DataFrame()
+    yield {
+        sql_output: gr.update(value=sql_view),
+        status_output: gr.update(value=status_view),
+        results_output: gr.update(value=out_df)
+    }
+    # Step 3: summarise only when a query actually succeeded. Asking the model
+    # to summarise a failed run costs a call and invites an invented answer.
+    if error:
+        summary = "No summary due to error."
+    else:
+        summary = summarize_with_llm(table, columns, results_df, user_question).strip()
+    yield {
+        summary_output: gr.update(value=summary),
+    }
+# UI layout: one question box, a submit button, six output components and an
+# abort button. The page title previously carried a stray ")".
+with gr.Blocks(title="NL2SQL Pipeline") as gradio_interface:
+    gr.Markdown("## NL2SQL  Pipeline ")
+    gr.Markdown("Enter a question about the water supply database. The agent will select relevant table/columns, generate and retry SQL on error, show results and a grounded summary.")
+    with gr.Row():
+        input_text = gr.Textbox(label="Enter your natural language question", lines=3)
+    with gr.Row():
+        submit_btn = gr.Button("Generate, Execute & Summarize", variant="primary")
+    with gr.Row():
+        table_output = gr.Textbox(label="Table Used", lines=1)
+        columns_output = gr.Textbox(label="Columns Used", lines=2)
+    with gr.Row():
+        sql_output = gr.Textbox(label="Generated SQL Query", lines=5)
+    with gr.Row():
+        status_output = gr.Textbox(label="Execution Status", lines=2)
+    with gr.Row():
+        results_output = gr.Dataframe(label="Query Results", interactive=False)
+    with gr.Row():
+        summary_output = gr.Textbox(label="LLM-Grounded Summary", lines=5)
+    with gr.Row():
+        abort_btn = gr.Button("Abort / Stop Task")
+    # The order of `outputs` is the order full_pipeline uses for its first,
+    # tuple-valued yield; its later yields address components by dict key.
+    running_event=submit_btn.click(
+        fn=full_pipeline,
+        inputs=input_text,
+        outputs=[sql_output, status_output, summary_output, table_output, columns_output, results_output]
+    )
+    # The abort button runs no function of its own; it only cancels the
+    # in-flight pipeline event.
+    abort_btn.click(
+        None,
+        inputs=None,
+        outputs=None,
+        cancels=[running_event],
+        queue=False
+    )
+if __name__ == "__main__":
+    gradio_interface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+gradio
+pandas
+langchain-nvidia-ai-endpoints

schema.py ADDED Viewed

	@@ -0,0 +1,297 @@

+# Plain-text description of the WASH database, passed to the LLM as prompt
+# context. It uses a simplified "TABLE name ( ... )" notation rather than
+# executable DDL; app.py extracts table and column names from it with regular
+# expressions keyed on the "TABLE <name> (" header, so keep that header form.
+# NOTE(review): wtps.wtp_name is declared INTEGER while every other *_name
+# column is TEXT - confirm against the real database.
+schema = """
+TABLE states (
+    state_id INTEGER PRIMARY KEY,
+    lgd_state_id INTEGER NOT NULL,
+    state_name TEXT NOT NULL,
+    census_state INTEGER NOT NULL
+)
+TABLE districts (
+    district_id INTEGER PRIMARY KEY,
+    lgd_district_id INTEGER NOT NULL,
+    district_name TEXT NOT NULL,
+    census_district INTEGER NOT NULL
+)
+TABLE blocks (
+    block_id INTEGER PRIMARY KEY,
+    lgd_block_id INTEGER NOT NULL,
+    block_name TEXT NOT NULL
+)
+TABLE panchayats (
+    panchayat_id INTEGER PRIMARY KEY,
+    lgd_panchayat_id INTEGER NOT NULL,
+    panchayat_name TEXT NOT NULL
+)
+TABLE divisions (
+    division_id INTEGER PRIMARY KEY,
+    division_name TEXT NOT NULL
+)
+TABLE villages (
+    village_id INTEGER PRIMARY KEY,
+    lgd_village_id INTEGER NOT NULL,
+    village_name TEXT NOT NULL,
+    census_village TEXT NOT NULL,
+    village_type TEXT NOT NULL,
+    village_status TEXT NOT NULL,
+    vap_status TEXT NOT NULL,
+    vwsc_formed INTEGER NOT NULL,
+    village_certificate INTEGER NOT NULL,
+    gp_resolution INTEGER NOT NULL,
+    declaration_video INTEGER NOT NULL,
+    total_no_households INTEGER NOT NULL,
+    total_no_house_connection INTEGER NOT NULL,
+    no_of_ftk_trained_women INTEGER NOT NULL,
+    no_of_school INTEGER NOT NULL,
+    school_with_tap_connection INTEGER NOT NULL,
+    no_of_aws INTEGER NOT NULL,
+    no_of_aws_with_tap_connection INTEGER NOT NULL,
+    total_pop INTEGER NOT NULL,
+    gen_pop INTEGER NOT NULL,
+    sc_pop INTEGER NOT NULL,
+    st_pop INTEGER NOT NULL,
+    sanctioned_approved_status INTEGER,
+    work_order_updated_status INTEGER,
+    scheme_is_work_started_status INTEGER
+)
+TABLE habitations (
+    habitation_id INTEGER PRIMARY KEY,
+    habitation_name TEXT NOT NULL,
+    is_pvtg INTEGER NOT NULL,
+    community_access_planned INTEGER NOT NULL,
+    pvtg_fully_partial INTEGER NOT NULL,
+    pvtg_households INTEGER NOT NULL,
+    total_no_households INTEGER NOT NULL,
+    total_no_house_connection INTEGER NOT NULL,
+    is_pvtg_given_by_mota INTEGER NOT NULL
+)
+TABLE source_type_categories (
+    source_type_category_id INTEGER PRIMARY KEY,
+    description TEXT NOT NULL
+)
+TABLE source_types (
+    source_type_id INTEGER PRIMARY KEY,
+    description TEXT NOT NULL
+)
+TABLE storage_structure_types (
+    storage_structure_type_id INTEGER PRIMARY KEY,
+    description TEXT NOT NULL
+)
+TABLE categories (
+    category_id INTEGER PRIMARY KEY,
+    description TEXT NOT NULL
+)
+TABLE water_sources (
+    source_id INTEGER PRIMARY KEY,
+    location TEXT,
+    source_type_category_id INTEGER,
+    source_type_id INTEGER,
+    response_on TEXT,
+    scheme_id INTEGER,
+    latitude TEXT,
+    longitude TEXT,
+    pws_fhtc_status INTEGER
+)
+TABLE schemes (
+    scheme_id INTEGER PRIMARY KEY,
+    scheme_name TEXT,
+    category TEXT,
+    no_of_villages INTEGER,
+    household_planned INTEGER,
+    fhtc_provided INTEGER,
+    is_pws INTEGER,
+    fhtc_scheme TEXT,
+    is_jjm INTEGER,
+    sanction_year TEXT,
+    type TEXT,
+    work_order_date TEXT,
+    status TEXT,
+    physical_progress_in_percentage REAL,
+    handed_over_community_status TEXT,
+    handed_over_community_date TEXT,
+    estimated_cost REAL,
+    csr_donation REAL,
+    om_cost REAL,
+    expenditure REAL,
+    total_central_expenditure REAL,
+    central_expenditure_sc REAL,
+    central_expenditure_st REAL,
+    central_expenditure_gen REAL,
+    total_state_expenditure REAL,
+    state_expenditure_sc REAL,
+    state_expenditure_st REAL,
+    state_expenditure_gen REAL,
+    total_world_bank_expenditure REAL,
+    total_community_expenditure REAL,
+    total_csr_expenditure REAL,
+    total_other_expenditure REAL,
+    total_expenditure_during_JJM REAL,
+    latitude REAL NOT NULL,
+    longitude REAL NOT NULL,
+    location TEXT NOT NULL
+)
+TABLE scheme_assets (
+    id INTEGER PRIMARY KEY,
+    habitation_id INTEGER,
+    scheme_id INTEGER,
+    scheme_name TEXT,
+    latitude REAL,
+    longitude REAL,
+    location TEXT,
+    category_id INTEGER,
+    FOREIGN KEY (habitation_id) REFERENCES habitations(habitation_id),
+    FOREIGN KEY (scheme_id) REFERENCES schemes(scheme_id),
+    FOREIGN KEY (category_id) REFERENCES categories(category_id)
+)
+TABLE district_state_mapping (
+    district_id INTEGER PRIMARY KEY,
+    state_id INTEGER,
+    FOREIGN KEY (district_id) REFERENCES districts(district_id),
+    FOREIGN KEY (state_id) REFERENCES states(state_id)
+)
+TABLE block_district_mapping (
+    block_id INTEGER PRIMARY KEY,
+    district_id INTEGER,
+    FOREIGN KEY (block_id) REFERENCES blocks(block_id),
+    FOREIGN KEY (district_id) REFERENCES districts(district_id)
+)
+TABLE block_division_mapping (
+    block_id INTEGER PRIMARY KEY,
+    division_id INTEGER,
+    FOREIGN KEY (block_id) REFERENCES blocks(block_id),
+    FOREIGN KEY (division_id) REFERENCES divisions(division_id)
+)
+TABLE panchayat_block_mapping (
+    panchayat_id INTEGER PRIMARY KEY,
+    block_id INTEGER,
+    FOREIGN KEY (panchayat_id) REFERENCES panchayats(panchayat_id),
+    FOREIGN KEY (block_id) REFERENCES blocks(block_id)
+)
+TABLE village_panchayat_mapping (
+    village_id INTEGER PRIMARY KEY,
+    panchayat_id INTEGER,
+    FOREIGN KEY (village_id) REFERENCES villages(village_id),
+    FOREIGN KEY (panchayat_id) REFERENCES panchayats(panchayat_id)
+)
+TABLE habitation_village_mapping (
+    habitation_id INTEGER PRIMARY KEY,
+    village_id INTEGER,
+    FOREIGN KEY (habitation_id) REFERENCES habitations(habitation_id),
+    FOREIGN KEY (village_id) REFERENCES villages(village_id)
+)
+TABLE source_habitation_mapping (
+    source_id INTEGER PRIMARY KEY,
+    habitation_id INTEGER,
+    FOREIGN KEY (source_id) REFERENCES water_sources(source_id),
+    FOREIGN KEY (habitation_id) REFERENCES habitations(habitation_id)
+)
+TABLE scheme_village_mapping (
+    scheme_id INTEGER,
+    village_id INTEGER,
+    PRIMARY KEY (scheme_id, village_id),
+    FOREIGN KEY (scheme_id) REFERENCES schemes(scheme_id),
+    FOREIGN KEY (village_id) REFERENCES villages(village_id)
+)
+TABLE scheme_division_mapping (
+    scheme_id INTEGER PRIMARY KEY,
+    division_id INTEGER,
+    FOREIGN KEY (scheme_id) REFERENCES schemes(scheme_id),
+    FOREIGN KEY (division_id) REFERENCES divisions(division_id)
+)
+TABLE source_type_source_type_category_mapping (
+    source_type_id INTEGER PRIMARY KEY,
+    source_type_category_id INTEGER,
+    FOREIGN KEY (source_type_id) REFERENCES source_types(source_type_id),
+    FOREIGN KEY (source_type_category_id) REFERENCES source_type_categories(source_type_category_id)
+)
+TABLE wtps (
+    wtp_id INTEGER PRIMARY KEY,
+    wtp_name INTEGER NOT NULL
+)
+TABLE labs (
+    lab_id INTEGER PRIMARY KEY,
+    lab_name TEXT NOT NULL,
+    lab_type TEXT NOT NULL,
+    lab_group TEXT NOT NULL,
+    latitude REAL,
+    longitude REAL,
+    wtp_id INTEGER NOT NULL,
+    is_in_house INTEGER NOT NULL,
+    FOREIGN KEY (wtp_id) REFERENCES wtps(wtp_id)
+)
+TABLE parameters (
+    parameterid INTEGER PRIMARY KEY,
+    parameter_name TEXT NOT NULL,
+    measurement_unit TEXT NOT NULL,
+    acceptable_limit REAL NOT NULL,
+    permissible_limit TEXT NOT NULL,
+    value_type TEXT NOT NULL,
+    value_type_description TEXT NOT NULL,
+    public_rate INTEGER NOT NULL,
+    department_rate INTEGER NOT NULL,
+    commercial_rate INTEGER NOT NULL,
+    test_parameter_type TEXT NOT NULL
+)
+TABLE types (
+    type_id INTEGER PRIMARY KEY,
+    type_name TEXT NOT NULL,
+    description TEXT NOT NULL
+)
+TABLE wtp_village_mapping (
+    wtp_id INTEGER PRIMARY KEY,
+    village_id INTEGER,
+    FOREIGN KEY (wtp_id) REFERENCES wtps(wtp_id),
+    FOREIGN KEY (village_id) REFERENCES villages(village_id)
+)
+TABLE lab_village_mapping (
+    lab_id INTEGER,
+    village_id INTEGER,
+    FOREIGN KEY (lab_id) REFERENCES labs(lab_id),
+    FOREIGN KEY (village_id) REFERENCES villages(village_id)
+)
+"""
+# Instruction prompt that embeds the full schema text and asks the model to
+# answer with a bare SQL query.
+# NOTE(review): app.py imports only `schema` from this module; no use of
+# `system_prompt` is visible in this commit - confirm whether it is still needed.
+system_prompt = f"""
+You are a precise SQL query generator assistant working with the database schema below.
+Only use the tables and columns explicitly provided in the schema when generating SQL.
+Schema definition:
+{schema}
+Guidelines:
+- Use the correct primary and foreign key relationships.
+- Do not invent tables or columns not listed in the schema.
+- If the natural language question is ambiguous, make a reasonable assumption about the intent.
+- Output only the final SQL query. Do not add any explanations or commentary.
+Instructions: The user question will be provided after this prompt. Write the SQL query that answers it.
+"""