Spaces:

Invicto69
/

SQLchat2

Sleeping

App Files Files Community

Invicto69 commited on May 1, 2025

Commit

fb30fd7

verified ·

1 Parent(s): 355116c

Synced repo using 'sync_with_huggingface' Github Action

Browse files

Files changed (5) hide show

app.py +124 -0
readme.md +79 -0
requirements.txt +6 -3
utils.py +154 -0
var.py +62 -0

app.py ADDED Viewed

	@@ -0,0 +1,124 @@

+from typing import Generator
+from utils import validate_uri, extract_code_blocks, get_info_sqlalchemy
+from langchain_community.utilities import SQLDatabase
+from var import system_prompt, markdown_info, query_output
+import streamlit as st
+from openai import OpenAI
st.set_page_config(layout="wide")

# Streamlit re-executes this script on every interaction, so anything that must
# survive a rerun (chat transcript, executor output, active model) is kept in
# st.session_state.
if "messages" not in st.session_state:
    st.session_state.messages = []
    st.session_state.sql_result = []
if "selected_model" not in st.session_state:
    st.session_state.selected_model = None

st.markdown("# SQL Chat")

st.sidebar.title("Settings")
base_url = st.sidebar.text_input("Base URL", help="OpenAI compatible API")
# Mask the key so it is not displayed in clear text on screen.
api_key = st.sidebar.text_input("API Key", type="password")
model = st.sidebar.text_input("Model ID")

# Switching to a different model starts a fresh conversation.
if st.session_state.selected_model != model:
    st.session_state.messages = []
    st.session_state.sql_result = []
    st.session_state.selected_model = model

uri = st.sidebar.text_input("Enter SQL Database URI")
uri_is_valid = validate_uri(uri)
# Stays None until a valid URI lets the schema be rendered into the prompt.
chat_system_prompt = None
if not uri_is_valid:
    st.sidebar.error("Enter valid URI")
else:
    st.sidebar.success("URI is valid")
    db_info = get_info_sqlalchemy(uri)
    # Format into new names instead of rebinding the imported templates.
    schema_markdown = markdown_info.format(**db_info)
    with st.expander("SQL Database Info"):
        st.markdown(schema_markdown)
    chat_system_prompt = system_prompt.format(markdown_info=schema_markdown)

# Gate on URI *validity*, not just non-emptiness: an invalid URI would otherwise
# raise in SQLDatabase.from_uri and leave the system prompt unformatted.
if base_url and api_key and model and uri_is_valid:
    client = OpenAI(
        base_url=base_url,
        api_key=api_key,
    )
    db = SQLDatabase.from_uri(uri)
    avatar = {"user": '👨‍💻', "assistant": '🤖', "executor": '🛢'}

    def clear_chat_history() -> None:
        """Empty both the chat transcript and the stored executor outputs."""
        # Two explicit statements: list.clear() returns None, so chaining the
        # calls with `and` would skip the second one.
        st.session_state.messages.clear()
        st.session_state.sql_result.clear()

    def generate_chat_responses(chat_completion) -> Generator[str, None, None]:
        """Yield the text fragments of a streamed chat completion.

        Chunks without choices or without text content are skipped.
        """
        for chunk in chat_completion:
            if chunk.choices and chunk.choices[0].delta.content:
                yield chunk.choices[0].delta.content

    # Replay the stored conversation. Messages are stored as user/assistant
    # pairs, and pair k owns executor output k.
    for i, message in enumerate(st.session_state.messages):
        with st.chat_message(message["role"], avatar=avatar[message["role"]]):
            st.markdown(message["content"])
        if i % 2 == 1 and i // 2 < len(st.session_state.sql_result):
            with st.chat_message("SQL Executor", avatar=avatar["executor"]):
                st.markdown(st.session_state.sql_result[i // 2])

    if prompt := st.chat_input("Enter your prompt here..."):
        user_message = {"role": "user", "content": prompt}
        with st.chat_message("user", avatar=avatar["user"]):
            st.markdown(prompt)

        # Ask the model for a query. The turn is only committed to history once
        # a response exists, so a failed request cannot break the pairing above.
        llm_response = None
        try:
            chat_completion = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": chat_system_prompt}]
                + [
                    {"role": m["role"], "content": m["content"]}
                    # Context window: the last 8 messages including this prompt.
                    for m in (st.session_state.messages + [user_message])[-8:]
                ],
                max_tokens=3000,
                stream=True,
            )
            with st.chat_message("SQL Assistant", avatar=avatar["assistant"]):
                llm_response = st.write_stream(
                    generate_chat_responses(chat_completion)
                )
        except Exception as e:
            st.error(e, icon="🚨")

        if llm_response is not None:
            # st.write_stream may return a list when non-text items were
            # streamed; flatten it so it can be parsed and stored as text.
            if not isinstance(llm_response, str):
                llm_response = "\n".join(str(item) for item in llm_response)

            with st.chat_message("SQL Executor", avatar=avatar["executor"]):
                queries = extract_code_blocks(llm_response)
                if not queries:
                    # E.g. the model asked a clarifying question instead.
                    executor_output = (
                        "No SQL query was found in the response, "
                        "so nothing was executed."
                    )
                    st.info(executor_output)
                else:
                    try:
                        result = db.run(queries[0])
                    except Exception as e:
                        st.error(e, icon="🚨")
                        executor_output = query_output.format(result=f"Error: {e}")
                    else:
                        executor_output = query_output.format(result=result)
                        st.write(executor_output)
                        # Keep only the head and tail of very large results in
                        # the stored transcript.
                        if len(str(result)) > 1000:
                            executor_output = (
                                executor_output[:500] + executor_output[-500:]
                            )

            # Commit the whole turn at once: one user message, one assistant
            # message, one executor output.
            st.session_state.messages.append(user_message)
            st.session_state.messages.append(
                {"role": "assistant", "content": llm_response})
            st.session_state.sql_result.append(executor_output)

    st.sidebar.button("Clear Chat History", on_click=clear_chat_history)

readme.md ADDED Viewed

	@@ -0,0 +1,79 @@

+# SQLchat
+This project is a **SQL Chatbot** built with **LangChain** and **Streamlit**, designed to generate and execute SQL queries
+based on database table schemas and structure. The chatbot can interact with users to understand their requirements
+and translate them into SQL queries, leveraging relational database information provided via URI and schema definitions.
+## Features
+- **SQL Query Generator**: Automatically generates SQL queries based on user inputs and database structure.
+- **SQL Query Execution**: Automatically executes SQL queries generated by chatbot.
+- **Interactive Chat Interface**: Built with Streamlit for a user-friendly conversational experience.
+- **Database Schema Integration**: Parses table schemas from a database URI to provide accurate SQL generation capabilities.
+- **Customizable LLM Configuration**: Supports various large language models (LLMs) for generating responses.
+## Installation
+1. Clone the repository:
+   ```bash
+   git clone https://github.com/arthiondaena/SQLchat.git
+   cd SQLchat
+   ```
+2. Set up a virtual environment:
+   ```bash
+   python -m venv venv
+   source venv/bin/activate    # On Windows: venv\Scripts\activate
+   ```
+3. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+## Usage
+Run the application using Streamlit:
+```bash
+streamlit run app.py
+```
+This will launch the chatbot interface in your default web browser. The chatbot can then process user inputs and generate SQL queries based on the database schema.
+## Setup
+1. **Configure Database Connection**:
+   - Set up the `URI` configuration in the streamlit app to connect to your relational database.
+   - Ensure the database has the necessary permissions to allow schema queries.
+2. **Table Schemas**:
+   - The chatbot extracts table structures and schemas from the database for generating SQL queries. Make sure the database contains valid schema definitions.
+3. **API Key Configuration**:
+   - Provide the base URL, API key, and model ID of an OpenAI-compatible LLM provider (for example, Groq) in the app's sidebar.
+4. **System Prompt Customization**:
+   - Adjust the instructions as per your specific SQL generation use case.
+   - The chatbot remembers up to the last 8 messages (about 4 question/answer exchanges).
+## Features in Detail
+1. **SQL Query Generation**:
+   - The chatbot uses relational database schemas to intelligently generate SQL queries.
+   - Supports basic and complex queries tailored to the provided database structure.
+2. **Database Schema Utilization**:
+   - Extracts table information (columns, types, relationships) from the connected database.
+   - Leverages this knowledge to produce highly precise SQL queries.
+3. **Customizable Model Prompts**:
+   - Custom system prompts and instructions can be added to suit diverse database use cases.
+## Example Workflow
+1. Connect the chatbot to your database by specifying the database URI.
+2. Provide the chatbot with your SQL query requirement in plain language (e.g., "Fetch the top 10 customers by revenue").
+3. The chatbot generates and returns an accurate SQL query based on the schema.

requirements.txt CHANGED Viewed

@@ -1,3 +1,6 @@
-altair
-pandas
-streamlit

+groq
+langchain
+langchain[groq]
+streamlit
+langchain_community
+psycopg2

utils.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import requests
+from langchain_community.utilities import SQLDatabase
+from langchain_community.tools.sql_database.tool import ListSQLDatabaseTool, InfoSQLDatabaseTool
+from sqlalchemy import (
+    create_engine,
+    MetaData,
+    inspect,
+    Table,
+    select,
+    distinct
+)
+from sqlalchemy.schema import CreateTable
+from sqlalchemy.exc import ProgrammingError
+from sqlalchemy.engine import Engine
+import re
def get_all_groq_model(api_key:str=None, timeout:float=10.0) -> list:
    """Fetch the IDs of all models available through the Groq API.

    Args:
        api_key: Groq API key, sent as a bearer token.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The list of model ID strings from the response's ``data`` array.

    Raises:
        ValueError: If ``api_key`` is None or empty.
        requests.HTTPError: If the API answers with an error status
            (for example, an invalid key).
    """
    if not api_key:
        raise ValueError("API key is required")
    url = "https://api.groq.com/openai/v1/models"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    # Without a timeout an unresponsive server would block the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail with a clear HTTP error instead of a KeyError on an error payload.
    response.raise_for_status()
    return [model['id'] for model in response.json()['data']]
def validate_api_key(api_key:str) -> bool:
    """Check a Groq API key by trying to list models with it.

    Args:
        api_key: The key to check.

    Returns:
        True if ``get_all_groq_model`` succeeds with the key; False if the key
        is None or empty, or the call fails for any reason.
    """
    # Covers both "" and None without making a request.
    if not api_key:
        return False
    try:
        get_all_groq_model(api_key=api_key)
    except Exception:
        # Any failure (bad key, network error, ...) means "not usable".
        return False
    return True
def validate_uri(uri:str) -> bool:
    """Check whether ``SQLDatabase.from_uri`` accepts the given database URI.

    Args:
        uri: The SQL database URI to check.

    Returns:
        True if ``SQLDatabase.from_uri(uri)`` completes without raising;
        False for None, non-string or blank input, or if it raises.
    """
    # Blank input (e.g. an untouched text field) can never be valid, so skip
    # the database attempt entirely.
    if not isinstance(uri, str) or not uri.strip():
        return False
    try:
        SQLDatabase.from_uri(uri)
    except Exception:
        return False
    return True
def get_info(uri:str) -> dict[str, str] | None:
    """Collect dialect, table list and table schemas via the SQLDatabase tools.

    Returns a dict with the keys ``sql_dialect``, ``tables`` and
    ``tables_schema``.
    """
    database = SQLDatabase.from_uri(uri)
    # Tables the tool reports for this database.
    table_names = ListSQLDatabaseTool(db=database).invoke("")
    # Schema description for exactly those tables.
    schema_text = InfoSQLDatabaseTool(db=database).invoke(table_names)
    return {
        'sql_dialect': database.dialect,
        'tables': table_names,
        'tables_schema': schema_text,
    }
def get_sample_rows(engine:Engine, table:Table, row_count: int = 3) -> str:
    """Render up to ``row_count`` rows of ``table`` as tab-separated text.

    The result is a caption line, a line of column names, then one line per
    row. Every cell is converted to ``str`` and cut to 100 characters.
    """
    query = select(table).limit(row_count)
    header = "\t".join(column.name for column in table.columns)
    try:
        with engine.connect() as conn:
            rows = [
                [str(value)[:100] for value in row]
                for row in conn.execute(query)  # type: ignore
            ]
        body = "\n".join("\t".join(cells) for cells in rows)
    except ProgrammingError:
        # Some dialects raise ProgrammingError when the table has no rows.
        body = ""
    return f"{row_count} rows from {table.name} table:\n{header}\n{body}"
def get_unique_values(engine:Engine, table:Table) -> str:
    """Summarise the distinct values of every column of ``table``.

    Args:
        engine: SQLAlchemy engine used to run the queries.
        table: Reflected table whose columns are inspected.

    Returns:
        A caption line followed by one line per column giving the number of
        distinct values and up to the first 20 of them. Each value is
        converted to ``str`` and cut to 100 characters. Columns whose query
        raises ``ProgrammingError`` are left out.
    """
    unique_values = {}
    for column in table.c:
        command = select(distinct(column))
        try:
            with engine.connect() as connection:
                result = connection.execute(command)  # type: ignore
                # Each row holds a single column: take that scalar (row[0])
                # rather than str(row), which would render as "('value',)".
                # Values are shortened the same way get_sample_rows does.
                unique_values[column.name] = [str(row[0])[:100] for row in result]
        except ProgrammingError:
            # In some dialects a ProgrammingError is raised when the table
            # has no rows; skip the column in that case.
            continue
    lines = [f"Unique values of each column in {table.name}: "]
    for name, values in unique_values.items():
        line = f"{name} has {len(values)} unique values: {' '.join(values[:20])}"
        if len(values) > 20:
            line += ", ...."
        lines.append(line)
    return "\n".join(lines) + "\n"
def get_info_sqlalchemy(uri:str) -> dict[str, str] | None:
    """Collect dialect, table list and annotated table schemas via SQLAlchemy.

    Args:
        uri: SQL database URI passed to ``create_engine``.

    Returns:
        A dict with ``sql_dialect`` (dialect name), ``tables`` (comma-separated
        table names) and ``tables_schema``. For each reflected table the schema
        holds the ``CREATE TABLE`` statement followed by an SQL comment with
        its sample rows and unique-value summary.
    """
    engine = create_engine(uri)
    try:
        dialect = inspect(engine).dialect.name
        # Reflect every table of the database into the metadata.
        metadata = MetaData()
        metadata.reflect(engine)
        tables = {}
        for table in metadata.tables.values():
            ddl = str(CreateTable(table).compile(engine)).rstrip()
            tables[table.name] = (
                f"{ddl}\n\n/*"
                f"\n{get_sample_rows(engine, table)}\n"
                f"\n{get_unique_values(engine, table)}\n"
                "*/"
            )
    finally:
        # The engine is private to this call; release its connection pool so
        # repeated calls do not accumulate open connections.
        engine.dispose()
    return {'sql_dialect': dialect, 'tables': ", ".join(tables.keys()), 'tables_schema': "\n\n".join(tables.values())}
def extract_code_blocks(text: str) -> list[str]:
    """Return the contents of every fenced Markdown code block in ``text``.

    A block starts with three backticks, an optional language tag and a line
    break, and ends at the next line that starts with three backticks. Trailing
    spaces or tabs after the tag and Windows (CRLF) line endings are tolerated.

    Args:
        text: Markdown text, e.g. a chat model's reply.

    Returns:
        The block bodies in order of appearance, without fences or language
        tag; an empty list if there is no fenced block.
    """
    pattern = r"```(?:\w+)?[ \t]*\r?\n(.*?)\r?\n```"
    # DOTALL lets a block body span several lines.
    return re.findall(pattern, text, re.DOTALL)
if __name__ == "__main__":
    # Manual smoke test: print the schema summary of the database named by the
    # POSTGRES_URI environment variable (optionally loaded from a .env file).
    import os

    from dotenv import load_dotenv

    load_dotenv()
    uri = os.getenv("POSTGRES_URI")
    if not uri:
        # Fail with a clear message instead of passing None to create_engine.
        raise SystemExit(
            "POSTGRES_URI is not set; define it in the environment or in a .env file."
        )
    print(get_info_sqlalchemy(uri))

var.py ADDED Viewed

	@@ -0,0 +1,62 @@

# Model IDs offered for Groq, one per line for easier diffing.
groq_models = [
    'llama-3.3-70b-versatile',
    'gemma2-9b-it',
    'llama-3.2-3b-preview',
    'deepseek-r1-distill-llama-70b',
    'qwen-2.5-coder-32b',
    'mixtral-8x7b-32768',
    'llama-3.1-8b-instant',
    'llama-3.2-1b-preview',
    'allam-2-7b',
    'qwen-qwq-32b',
    'llama3-70b-8192',
    'mistral-saba-24b',
    'deepseek-r1-distill-qwen-32b',
    'qwen-2.5-32b',
    'llama-3.3-70b-specdec',
    'llama3-8b-8192',
    'llama-guard-3-8b',
]

# Empty placeholder with the same keys the markdown_info template expects.
db_info = dict.fromkeys(('sql_dialect', 'tables', 'tables_schema'), '')
# Markdown template describing a database; filled with str.format using the
# keys sql_dialect, tables and tables_schema.
markdown_info = (
    "\n"
    "**SQL Dialect**: {sql_dialect}\n\n"
    "**Tables**: {tables}\n\n"
    "**Tables Schema**:\n"
    "```sql\n"
    "{tables_schema}\n"
    "```\n"
)
# System prompt template for the SQL assistant. Fill the single placeholder
# with system_prompt.format(markdown_info=...), passing the rendered database
# description. Trailing backslashes join a long sentence onto one line; there
# is deliberately none before the "## SQL Database Info" heading so that the
# heading starts on its own line.
system_prompt = """
You are an AI assistant specialized in generating optimized SQL queries based on user instructions. \
You have access to the database schema provided in a structured Markdown format. Use this schema to ensure \
correctness, efficiency, and security in your SQL queries.
## SQL Database Info
{markdown_info}
---
## Query Generation Guidelines
1. **Ensure Query Validity**: Use only the tables and columns defined in the schema.
2. **Optimize Performance**: Prefer indexed columns for filtering, avoid `SELECT *` where specific columns suffice.
3. **Security Best Practices**: Always use parameterized queries or placeholders instead of direct user inputs.
4. **Context Awareness**: Understand the intent behind the query and generate the most relevant SQL statement.
5. **Formatting**: Return queries in a clean, well-structured format with appropriate indentation.
6. **Commenting**: Include comments in complex queries to explain logic when needed.
7. **Result**: Don't return the result of the query, return only the SQL query.
8. **Optimal**: Try to generate query which is optimal and not brute force.
9. **Single query**: Generate a best single SQL query for the user input.
10. **Comment**: Include comments in the query to explain the logic behind it.
---
## Expected Output Format
The SQL query should be returned as a formatted code block:
```sql
-- Get all completed orders with user details
-- Comment explaining the logic.
SELECT orders.id, users.name, users.email, orders.amount, orders.created_at
FROM orders
JOIN users ON orders.user_id = users.id
WHERE orders.status = 'completed'
ORDER BY orders.created_at DESC;
```
If the user's request is ambiguous, ask clarifying questions before generating the query.
"""
# Markdown template for showing a query result; filled with
# query_output.format(result=...).
query_output = (
    "\n"
    "**The result of query execution:**\n"
    "```sql\n"
    "{result}\n"
    "```\n"
)