Spaces:

datajoi
/

DataViz-Agent

Runtime error

App Files Files Community

Muhammad Mustehson committed on Jan 16

Commit

4a84072

1 Parent(s): 332cf4d

Update Old Code

Browse files

Files changed (9) hide show

.gitignore +1 -0
__pycache__/prompt.cpython-311.pyc +0 -0
app.py +86 -103
requirements.txt +7 -8
src/__init__.py +0 -0
src/client.py +120 -0
src/models.py +142 -0
src/pipelines.py +401 -0
src/utils.py +57 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/

__pycache__/prompt.cpython-311.pyc DELETED Viewed

Binary file (2.63 kB)

app.py CHANGED Viewed

@@ -1,47 +1,35 @@
 import os
 import duckdb
 import gradio as gr
 import matplotlib.pyplot as plt
-from transformers import HfEngine, ReactCodeAgent
-from transformers.agents import Tool
-from langsmith import traceable
-from langchain import hub
-# Height of the Tabs Text Area
 TAB_LINES = 8
-#----------CONNECT TO DATABASE----------
-md_token = os.getenv('MD_TOKEN')
-conn = duckdb.connect(f"md:my_db?motherduck_token={md_token}", read_only=True)
-#---------------------------------------
-#-------LOAD HUGGINGFACE MODEL-------
-models = ["Qwen/Qwen2.5-72B-Instruct","meta-llama/Meta-Llama-3-70B-Instruct",
-          "meta-llama/Llama-3.1-70B-Instruct"]
-model_loaded = False
-for model in models:
-  try:
-      llm_engine = HfEngine(model=model)
-      info = llm_engine.client.get_endpoint_info()
-      model_loaded = True
-      break
-  except Exception as e:
-      print(f"Error for model {model}: {e}")
-      continue
-if not model_loaded:
-    gr.Warning(f"❌ None of the model form {models} are available. {e}")
-#---------------------------------------
-#-----LOAD PROMPT FROM LANCHAIN HUB-----
-prompt = hub.pull("viz-prompt")
-#-------------------------------------
-#--------------ALL UTILS----------------
 def get_schemas():
     schemas = conn.execute("""
     SELECT DISTINCT schema_name
@@ -50,22 +38,26 @@ def get_schemas():
     """).fetchall()
     return [item[0] for item in schemas]
-# Get Tables
 def get_tables(schema_name):
-    tables = conn.execute(f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}'").fetchall()
     return [table[0] for table in tables]
-# Update Tables
 def update_tables(schema_name):
     tables = get_tables(schema_name)
     return gr.update(choices=tables)
-# Get Schema
 def get_table_schema(table):
-    result = conn.sql(f"SELECT sql, database_name, schema_name FROM duckdb_tables() where table_name ='{table}';").df()
-    ddl_create = result.iloc[0,0]
-    parent_database = result.iloc[0,1]
-    schema_name = result.iloc[0,2]
     full_path = f"{parent_database}.{schema_name}.{table}"
     if schema_name != "main":
         old_path = f"{schema_name}.{table}"
@@ -75,62 +67,39 @@ def get_table_schema(table):
     return ddl_create, full_path
-class SQLExecutorTool(Tool):
-    name = "sql_engine"
-    inputs = {
-        "query": {
-            "type": "text",
-            "description": f"The query to perform. This should be correct DuckDB SQL.",
-        }
-    }
-    description = """Allows you to perform SQL queries on the table. Returns a pandas dataframe representation of the result."""
-    output_type = "pandas.core.frame.DataFrame"
-    def forward(self, query: str) -> str:
-        output_df = conn.sql(query).df()
-        return output_df
-tool = SQLExecutorTool()
-def process_outputs(output) :
-    return {
-        'sql': output.get('sql', None),
-        'code': output.get('code', None)
-    }
-@traceable(process_outputs=process_outputs)
-def get_visualization(question, schema, table_name):
-    agent = ReactCodeAgent(tools=[tool], llm_engine=llm_engine, add_base_tools=True,
-                           additional_authorized_imports=['matplotlib.pyplot',
-                                                 'pandas', 'plotly.express',
-                                                 'seaborn'], max_iterations=10)
-    results = agent.run(
-        task= prompt.format(question=question, schema=schema, table_name=table_name)
-    )
-    return results
-#---------------------------------------
 def main(table, text_query):
-    # Empty Fig
     fig, ax = plt.subplots()
-    ax.set_axis_off()
-    schema, table_name = get_table_schema(table)
     try:
-        output = get_visualization(question=text_query, schema=schema, table_name=table_name)
-        fig = output.get('fig', None)
-        generated_sql = output.get('sql', None)
-        data = output.get('data', None)
     except Exception as e:
         gr.Warning(f"❌ Unable to generate the visualization. {e}")
-    return fig, generated_sql, data
 custom_css = """
 .gradio-container {
@@ -150,7 +119,9 @@ custom_css = """
 }
 """
-with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"), css=custom_css) as demo:
     gr.Image("logo.png", label=None, show_label=False, container=False, height=100)
     gr.Markdown("""
@@ -162,13 +133,18 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
     """)
     with gr.Row():
         with gr.Column(scale=1):
-            schema_dropdown = gr.Dropdown(choices=get_schemas(), label="Select Schema", interactive=True)
-            tables_dropdown = gr.Dropdown(choices=[], label="Available Tables", value=None)
         with gr.Column(scale=2):
-            query_input = gr.Textbox(lines=3, label="Text Query", placeholder="Enter your text query here...")
             with gr.Row():
                 with gr.Column(scale=7):
                     pass
@@ -178,18 +154,25 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
     with gr.Tabs():
         with gr.Tab("Plot"):
             result_plot = gr.Plot()
-        with gr.Tab("SQL"):
-            generated_sql = gr.Textbox(lines=TAB_LINES, label="Generated SQL", value="", interactive=False,
-                                              autoscroll=False)
-        with gr.Tab("Data"):
             data = gr.Dataframe(label="Data", interactive=False)
-        schema_dropdown.change(update_tables, inputs=schema_dropdown, outputs=tables_dropdown)
-        generate_query_button.click(main, inputs=[tables_dropdown, query_input], outputs=[result_plot, generated_sql, data])
 if __name__ == "__main__":
     demo.launch(debug=True)

+import logging
 import os
 import duckdb
 import gradio as gr
 import matplotlib.pyplot as plt
+import pandas as pd
+from src.client import LLMChain
+from src.models import Charts, TableData
+from src.pipelines import SQLVizChain
+from src.utils import plot_chart
+MD_TOKEN = os.getenv("MD_TOKEN")
+conn = duckdb.connect(f"md:my_db?motherduck_token={MD_TOKEN}", read_only=True)
+LEVEL = "INFO" if not os.getenv("ENV") == "PROD" else "WARNING"
 TAB_LINES = 8
+logging.basicConfig(
+    level=getattr(logging, LEVEL, logging.INFO),
+    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+)
+logger = logging.getLogger(__name__)
+def _load_pipeline():
+    return SQLVizChain(duckdb=conn, chain=LLMChain())
+pipeline = _load_pipeline()
 def get_schemas():
     schemas = conn.execute("""
     SELECT DISTINCT schema_name
     """).fetchall()
     return [item[0] for item in schemas]
 def get_tables(schema_name):
+    tables = conn.execute(
+        f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}'"
+    ).fetchall()
     return [table[0] for table in tables]
 def update_tables(schema_name):
     tables = get_tables(schema_name)
     return gr.update(choices=tables)
 def get_table_schema(table):
+    result = conn.sql(
+        f"SELECT sql, database_name, schema_name FROM duckdb_tables() where table_name ='{table}';"
+    ).df()
+    ddl_create = result.iloc[0, 0]
+    parent_database = result.iloc[0, 1]
+    schema_name = result.iloc[0, 2]
     full_path = f"{parent_database}.{schema_name}.{table}"
     if schema_name != "main":
         old_path = f"{schema_name}.{table}"
     return ddl_create, full_path
 def main(table, text_query):
     fig, ax = plt.subplots()
+    ax.set_axis_off()
+    schema, _ = get_table_schema(table)
     try:
+        results = pipeline.run(user_question=text_query, context=schema)
+        chart_data = results["chart_data"]
+        chart_config = results["chart_config"]
+        chart_type = results["chart_type"]
+        generated_sql = results["sql_config"]["sql_query"]
+        if not chart_type and chart_data is not None:
+            if isinstance(chart_data, TableData):
+                data = pd.DataFrame(chart_data.model_dump(exclude_none=True))
+            return (fig, generated_sql, data)
+        if chart_type is not None and chart_data is not None:
+            if isinstance(chart_data, Charts):
+                chart_dict = chart_data.model_dump(exclude_none=True).get(chart_type)
+                data = pd.DataFrame(chart_dict["data"])
+            fig = plot_chart(chart_type=chart_type, data=data, **chart_config)
+            return (fig, generated_sql, data)
+        if chart_data is None:
+            return fig, generated_sql, None
     except Exception as e:
+        logger.error(e)
         gr.Warning(f"❌ Unable to generate the visualization. {e}")
+    return fig, None, None
 custom_css = """
 .gradio-container {
 }
 """
+with gr.Blocks(
+    theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"), css=custom_css
+) as demo:
     gr.Image("logo.png", label=None, show_label=False, container=False, height=100)
     gr.Markdown("""
     """)
     with gr.Row():
         with gr.Column(scale=1):
+            schema_dropdown = gr.Dropdown(
+                choices=get_schemas(), label="Select Schema", interactive=True
+            )
+            tables_dropdown = gr.Dropdown(
+                choices=[], label="Available Tables", value=None
+            )
         with gr.Column(scale=2):
+            query_input = gr.Textbox(
+                lines=3, label="Text Query", placeholder="Enter your text query here..."
+            )
             with gr.Row():
                 with gr.Column(scale=7):
                     pass
     with gr.Tabs():
         with gr.Tab("Plot"):
             result_plot = gr.Plot()
+        with gr.Tab("SQL"):
+            generated_sql = gr.Textbox(
+                lines=TAB_LINES,
+                label="Generated SQL",
+                value="",
+                interactive=False,
+                autoscroll=False,
+            )
+        with gr.Tab("Data"):
             data = gr.Dataframe(label="Data", interactive=False)
+        schema_dropdown.change(
+            update_tables, inputs=schema_dropdown, outputs=tables_dropdown
+        )
+        generate_query_button.click(
+            main,
+            inputs=[tables_dropdown, query_input],
+            outputs=[result_plot, generated_sql, data],
+        )
 if __name__ == "__main__":
     demo.launch(debug=True)

requirements.txt CHANGED Viewed

@@ -1,9 +1,8 @@
-torch
-seaborn
-plotly
 huggingface_hub
-accelerate==0.34.2
-transformers==4.44.2
-duckdb==1.1.1
-langsmith==0.1.135
-langchain==0.3.4

 huggingface_hub
+duckdb
+pandas
+pydantic
+python-dotenv
+gradio
+pandas
+matplotlib

src/__init__.py ADDED Viewed

File without changes

src/client.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import json
+import logging
+import os
+from dotenv import load_dotenv
+from huggingface_hub import InferenceClient
+from pydantic import BaseModel
+load_dotenv()
+logger = logging.getLogger(__name__)
# Generation settings applied to every chat-completion request.
MAX_RESPONSE_TOKENS = 2048
TEMPERATURE = 0.9
# MODEL_NAMES and PROVIDERS are read as JSON documents and iterated as
# candidate lists. An unset or empty variable falls back to an empty list
# instead of failing inside json.loads() at import time.
models = json.loads(os.getenv("MODEL_NAMES") or "[]")
providers = json.loads(os.getenv("PROVIDERS") or "[]")
EMB_MODEL = os.getenv("EMB_MODEL")
def _engine_working(engine: InferenceClient) -> bool:
    """Report whether ``engine`` can answer a minimal chat request.

    A one-token "ping" chat completion is sent. Any exception raised by the
    call is logged with its traceback and reported as ``False``.
    """
    probe = [{"role": "user", "content": "ping"}]
    try:
        engine.chat_completion(probe, max_tokens=1)
    except Exception as e:
        logger.exception(f"Engine is not working: {e}")
        return False
    logger.info("Engine is Working.")
    return True
def _load_llm_client() -> InferenceClient:
    """Return the first working ``InferenceClient`` among the configured candidates.

    Every (model, provider) pair from the module-level ``models`` and
    ``providers`` lists is tried in order, model-major. A client is returned
    as soon as ``_engine_working`` accepts it.

    Raises:
        Exception: if no pair yields a working client. The message carries one
            entry per failed candidate, including candidates whose health
            check failed without raising.
    """
    logger.warning("Loading Model...")
    errors = []
    for model in models:
        if not isinstance(model, str):
            # Only string entries are tried; record the skip so the final
            # error explains why nothing was attempted for this entry.
            errors.append(f"Skipped non-string model entry: {model!r}")
            continue
        for provider in providers:
            try:
                logger.info(f"Checking model: {model} provider: {provider}")
                client = InferenceClient(
                    model=model,
                    timeout=15,
                    provider=provider,
                )
                if _engine_working(client):
                    logger.info(
                        f"The model is loaded : {model} , provider: {provider}"
                    )
                    return client
                # _engine_working swallows the underlying exception, so note
                # the failed pair here to keep the final report complete.
                errors.append(
                    f"Health check failed for model {model} provider {provider}"
                )
            except Exception as e:
                logger.error(f"Error loading model {model} provider {provider}: {e}")
                errors.append(str(e))
    raise Exception(f"Unable to load any provided model: {errors}.")
# Resolved once at import time; _load_llm_client() probes the configured
# model/provider pairs and raises if none of them works.
_default_client = _load_llm_client()


class LLMChain:
    """Small wrapper around ``InferenceClient.chat_completion``.

    Tracks the cumulative token usage of successful calls in ``total_tokens``
    and optionally requests and unpacks JSON-schema structured output.
    """

    def __init__(self, client: InferenceClient = _default_client):
        self.client = client
        # Running sum of usage.total_tokens over successful calls.
        self.total_tokens = 0

    def run(
        self,
        system_prompt: str | None = None,
        user_prompt: str | None = None,
        messages: list[dict] | None = None,
        format_name: str | None = None,
        response_format: type[BaseModel] | None = None,
    ) -> str | dict[str, str | int | float | None] | list[str] | None:
        """Send one chat completion and return its (optionally parsed) content.

        Args:
            system_prompt: System message; used only together with ``user_prompt``.
            user_prompt: User message; used only together with ``system_prompt``.
            messages: Pre-built message list, used when the prompt pair is not
                supplied. If both are supplied the prompt pair wins.
            format_name: Name sent with the JSON schema. Defaults to the
                ``response_format`` class name so that a schema is always
                requested whenever the reply is going to be parsed as JSON.
            response_format: Pydantic model describing the expected JSON reply.

        Returns:
            The raw reply text when ``response_format`` is not given. Otherwise
            the parsed reply reduced to the model's declared fields: the bare
            value for a single-field model, a ``{field: value}`` dict for a
            multi-field model. ``None`` if anything fails (the failure is
            logged with its traceback).
        """
        try:
            if system_prompt and user_prompt:
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ]
            elif not messages:
                raise ValueError(
                    "Either system_prompt and user_prompt or messages must be provided."
                )

            schema_request = None
            if response_format:
                schema_request = {
                    "type": "json_schema",
                    "json_schema": {
                        "name": format_name or response_format.__name__,
                        "schema": response_format.model_json_schema(),
                        "strict": True,
                    },
                }

            llm_response = self.client.chat_completion(
                messages=messages,
                max_tokens=MAX_RESPONSE_TOKENS,
                temperature=TEMPERATURE,
                response_format=schema_request,
            )

            # NOTE(review): guarded in case a provider omits usage data - confirm.
            usage = getattr(llm_response, "usage", None)
            if usage is not None and usage.total_tokens is not None:
                self.total_tokens += usage.total_tokens

            analysis = llm_response.choices[0].message.content
            if not response_format:
                return analysis

            parsed = json.loads(analysis)
            fields = list(response_format.model_fields.keys())
            if len(fields) == 1:
                return parsed.get(fields[0])
            return {field: parsed.get(field) for field in fields}
        except Exception as e:
            # Best-effort API: callers treat None as "no answer".
            logger.exception(f"Error during LLM calls: {e}")
            return None

src/models.py ADDED Viewed

	@@ -0,0 +1,142 @@

+from datetime import datetime
+from enum import Enum
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
class SmallCardNum:
    """Marker for a numeric column with fewer than 10 distinct values.

    Carries no state; ChartFormatter.detect_dtype in src/pipelines.py creates
    instances and chart selection branches on the instance type.
    """
class Continuous:
    """Marker for a numeric column with 10 or more distinct values.

    Carries no state; ChartFormatter.detect_dtype in src/pipelines.py creates
    instances and chart selection branches on the instance type.
    """
class DateTime:
    """Marker for a column with a datetime64 or timedelta64 dtype.

    Carries no state; ChartFormatter.detect_dtype in src/pipelines.py creates
    instances and chart selection branches on the instance type.
    """
class Nominal:
    """Marker for a column that is neither numeric nor datetime-like.

    Carries no state; ChartFormatter.detect_dtype in src/pipelines.py uses it
    as the fallback classification (boolean columns land here as well).
    """
class Route(BaseModel):
    # Structured-output schema for the query router (passed as
    # response_format by QueryRouter in src/pipelines.py).
    # Documented with comments rather than a docstring so the JSON schema
    # generated from this model is unchanged.
    label: int = Field(
        ...,
        description="Classify user queries as: 0 for Irrelevant/Vague/Incomplete, 1 for Visualizable, and 2 for SQL-only.",
    )
class SQLQueryModel(BaseModel):
    # Structured-output schema for SQL generation (passed as response_format
    # by SQLPipeline.generate_sql in src/pipelines.py). Both fields are
    # required. Comments are used instead of a docstring so the generated
    # JSON schema is unchanged.
    sql_query: str = Field(description="SQL query to execute.")
    explanation: str = Field(description="Short explanation of the SQL query.")
class DataPoint(BaseModel):
    # One chart datum: either an (x, y) pair or a histogram bin described by
    # (bin_start, bin_end, frequency). Members that do not apply stay None.
    x: int | float | str | None = None
    y: int | float | str | None = None
    bin_start: int | float | None = None
    bin_end: int | float | None = None
    frequency: int | float | None = None

    @field_validator("bin_start", "bin_end", "frequency", "x", "y", mode="before")
    @classmethod
    def to_native(cls, field_value):
        """Convert numpy / pandas / datetime scalars to plain Python values.

        * numpy integers become ``int`` (they used to be widened to ``float``,
          which lost integer-ness and precision for large values);
        * numpy floats become ``float``;
        * ``datetime`` / ``pd.Timestamp`` / ``np.datetime64`` values become
          ``"YYYY-MM-DD"`` strings.

        Anything else is returned unchanged for normal field validation.
        """
        if isinstance(field_value, np.integer):
            return int(field_value)
        if isinstance(field_value, np.floating):
            return float(field_value)
        if isinstance(field_value, np.datetime64):
            # np.datetime64 has no strftime(); convert through pandas first.
            return pd.Timestamp(field_value).strftime("%Y-%m-%d")
        if isinstance(field_value, (datetime, pd.Timestamp)):
            return field_value.strftime("%Y-%m-%d")
        return field_value

    @model_validator(mode="before")
    @classmethod
    def validate_keys(cls, values):
        """Require a complete (x, y) pair or a complete histogram triple.

        Raises:
            ValueError: if neither group is fully populated.
        """
        if not isinstance(values, dict):
            # Non-mapping input has no .get(); leave it to field validation.
            return values
        xy = values.get("x") is not None and values.get("y") is not None
        bxy = all(
            values.get(key) is not None
            for key in ("bin_start", "bin_end", "frequency")
        )
        if not (xy or bxy):
            raise ValueError(
                "Invalid input: Must provide either (x, y) OR (bin_start, bin_end, frequency), but not a mix."
            )
        return values
class Data(BaseModel):
    # Ordered collection of chart points for a single chart.
    data: list[DataPoint] = Field(default_factory=list)

    @classmethod
    def validate_data(cls, data):
        """Build a ``Data`` instance from raw point records.

        Raises:
            ValueError: if validation fails; the message carries the first
                reported validation error and the original ``ValidationError``
                is attached as the cause.
        """
        try:
            return cls(data=data)
        except ValidationError as e:
            raise ValueError(f"Invalid data format: {e.errors()[0]}") from e
class TableData(BaseModel):
    # Wraps a query result that is shown as a plain table. The field is
    # required; a DataFrame is not a pydantic-native type, hence the
    # arbitrary_types_allowed setting.
    model_config = {"arbitrary_types_allowed": True}

    data: pd.DataFrame

    @model_validator(mode="after")
    def timestamp_to_str(self):
        """Render every datetime column (naive or tz-aware) as strings.

        Works on a copy so the DataFrame handed in by the caller is left
        untouched. Columns are selected by dtype only, so a falsy column label
        such as ``0`` is converted like any other.
        """
        datetime_cols = self.data.select_dtypes(
            include=["datetime", "datetimetz"]
        ).columns
        if len(datetime_cols) > 0:
            converted = self.data.copy()
            for col in datetime_cols:
                converted[col] = converted[col].astype(str)
            self.data = converted
        return self

    def model_dump(self, *args, **kwargs):  # noqa: ARG002
        """Return the table as ``{column: [values, ...]}``; arguments are ignored."""
        return self.data.to_dict(orient="list")
class Charts(BaseModel):
    # Container with one optional slot per chart kind.
    bar: Data | None = None
    line: Data | None = None
    pie: Data | None = None
    hist: Data | None = None

    @model_validator(mode="after")
    def process_charts_data(self):
        """Turn the ``x`` value of every bar and pie point into a string.

        Line and histogram data are left as they are.
        """
        for series in (self.bar, self.pie):
            if not series or not series.data:
                continue
            for point in series.data:
                if isinstance(point.x, str):
                    continue
                point.x = str(point.x)
        return self
class PlotType(str, Enum):
    # Chart kinds the application can draw. Each member is a plain string
    # equal to its name. The values used to be written as one-element tuples
    # (stray trailing commas), which only worked because the str mix-in
    # unpacks the tuple; they are now the plain strings that were intended.
    bar = "bar"
    line = "line"
    pie = "pie"
    hist = "hist"
class PlotConfig(BaseModel):
    # Presentation settings for one chart, requested from the LLM as
    # structured output (passed as response_format by
    # SQLVizChain.create_chart_config in src/pipelines.py). Comments are used
    # instead of a docstring so the generated JSON schema is unchanged.
    type: PlotType = Field(
        ...,
        description="Type of plot, e.g., 'bar', 'line', 'pie'. Supported types depend on ShadCN implementation.",
    )
    title: str = Field(..., description="Title of the plot to display above the plot.")
    x_axis_label: str = Field(..., description="Label for the X-axis of the plot.")
    y_axis_label: str = Field(..., description="Label for the Y-axis of the plot.")
    # Optional; a legend is shown unless the reply says otherwise.
    legend: bool = Field(True, description="Flag to display a legend for the plot.")

src/pipelines.py ADDED Viewed

	@@ -0,0 +1,401 @@

+import logging
+import os
+from typing import Any
+import numpy as np
+import pandas as pd
+from dotenv import load_dotenv
+from duckdb import DuckDBPyConnection
+from src.models import (
+    Charts,
+    Continuous,
+    Data,
+    DateTime,
+    Nominal,
+    PlotConfig,
+    Route,
+    SmallCardNum,
+    SQLQueryModel,
+    TableData,
+)
+load_dotenv()
+logger = logging.getLogger(__name__)
# Row cap applied to bar-chart data when it has more distinct labels than this.
MAX_BARS_COUNT = 20
# Retries allowed after the first SQL attempt; defaults to 5 when unset.
SQL_GENERATION_RETRIES = int(os.environ.get("SQL_GENERATION_RETRIES", "5"))
# Prompt templates come from the environment and are None when unset.
SQL_PROMPT = os.environ.get("SQL_PROMPT")
USER_PROMPT = os.environ.get("USER_PROMPT")
ROUTER_SYSTEM_PROMPT = os.environ.get("ROUTER_SYSTEM_PROMPT")
CHART_CONFIG_SYSTEM_PROMPT = os.environ.get("CHART_CONFIG_SYSTEM_PROMPT")
CHART_CONFIG_USER_PROMPT = os.environ.get("CHART_CONFIG_USER_PROMPT")
class SQLPipeline:
    """Generate SQL with the LLM chain and execute it on DuckDB, with retries."""

    def __init__(
        self,
        duckdb: DuckDBPyConnection,
        chain,
    ) -> None:
        self._duckdb = duckdb
        self.chain = chain

    def generate_sql(
        self, user_question: str, context: str, errors: str | None = None
    ) -> str | dict[str, str | int | float | None] | list[str] | None:
        """Ask the LLM for a SQL query plus a short explanation.

        Args:
            user_question: Natural-language question.
            context: Text substituted for ``{context}`` in ``USER_PROMPT``.
            errors: Accumulated failure notes from earlier attempts. When
                given, they are appended to the prompt together with an
                instruction to avoid repeating them.

        Returns:
            Whatever ``chain.run`` returns for ``SQLQueryModel``; ``None`` when
            the LLM call failed.
        """
        user_prompt_formatted = USER_PROMPT.format(
            question=user_question, context=context
        )
        if errors:
            # Built with implicit concatenation: the earlier backslash
            # continuations leaked source indentation into the prompt and
            # glued the instruction onto the preceding text.
            user_prompt_formatted += (
                "\nCarefully review the previous error or exception and rewrite "
                "the SQL so that the error does not occur again. Try a different "
                f"approach or rewrite SQL if needed. Last error: {errors}"
            )
        sql = self.chain.run(
            system_prompt=SQL_PROMPT,
            user_prompt=user_prompt_formatted,
            format_name="sql_query",
            response_format=SQLQueryModel,
        )
        logger.info(f"SQL Generated Successfully: {sql}")
        return sql

    def run_query(self, sql_query: str) -> pd.DataFrame | None:
        """Execute ``sql_query`` on the DuckDB connection and return a DataFrame."""
        logger.info("Query Execution Started.")
        return self._duckdb.query(sql_query).df()

    def try_sql_with_retries(
        self,
        user_question: str,
        context: str,
        max_retries: int = SQL_GENERATION_RETRIES,
    ) -> tuple[
        str | dict[str, str | int | float | None] | list[str] | None,
        pd.DataFrame | None,
    ]:
        """Generate and execute SQL, retrying until rows come back.

        One initial attempt is made plus up to ``max_retries`` retries. Every
        failure, whether an exception or an empty result, is appended to a
        running note that is fed into the next generation prompt.

        Returns:
            ``(sql, dataframe)`` for the first attempt that returns at least
            one row. ``(None, None)`` if the LLM produced nothing or every
            attempt failed.
        """
        total_attempts = max_retries + 1  # the first attempt is not a retry
        all_errors = ""
        for attempt in range(1, total_attempts + 1):
            if attempt > 1:
                logger.info(f"Retrying: {attempt - 1}")
            try:
                sql = self.generate_sql(
                    user_question, context, errors=all_errors or None
                )
                if not sql:
                    # The LLM call itself failed; retrying the same call is
                    # not attempted here.
                    return None, None

                sql_query_str = sql.get("sql_query") if isinstance(sql, dict) else sql
                if not isinstance(sql_query_str, str):
                    raise ValueError(
                        f"Expected SQL query to be a string, got {type(sql_query_str).__name__}"
                    )

                query_df = self.run_query(sql_query_str)
                if query_df is not None and not query_df.empty:
                    return sql, query_df
                # An empty result is retried as before, but now leaves a note
                # so the next prompt knows what happened.
                failure = f"\n[Attempt {attempt}] Query returned no rows."
            except Exception as e:
                failure = f"\n[Attempt {attempt}] {type(e).__name__}: {e}"
            logger.error(f"Error during SQL generation or execution: {failure}")
            all_errors += failure

        logger.error(f"Failed after {total_attempts} attempts. Errors: {all_errors}")
        return None, None
class QueryRouter:
    """Classify a user question with the LLM before any SQL is generated."""

    def __init__(self, chain) -> None:
        self.chain = chain

    def route_request(self, user_question: str, context: str) -> int | None:
        """Return the route label for ``user_question``.

        The label follows the ``Route`` schema: 0 irrelevant/vague/incomplete,
        1 visualizable, 2 SQL-only. The value is returned exactly as produced
        by ``chain.run`` and is therefore ``None`` when the LLM call failed.
        """
        user_prompt_formatted = USER_PROMPT.format(
            question=user_question, context=context
        )
        route = self.chain.run(
            system_prompt=ROUTER_SYSTEM_PROMPT,
            user_prompt=user_prompt_formatted,
            format_name="route_queries",
            response_format=Route,
        )
        # The previous message also described a route "3", which the Route
        # schema does not define.
        logger.info(
            f"Query routed to: {route} (0 = irrelevant, 1 = visualizable, 2 = SQL-only)."
        )
        return route
class ChartFormatter:
    """Classify DataFrame columns and shape one- or two-column results into chart data."""

    # Numeric columns with fewer distinct values than this are SmallCardNum.
    SMALL_CARDINALITY_THRESHOLD = 10
    # A single categorical column with at most this many categories is a pie.
    MAX_PIE_SLICES = 6
    HISTOGRAM_BINS = 50

    def _build_xy_data(self, label_data, value_data, limit_unique_x=False):
        """Return ``[{"x": ..., "y": ...}, ...]`` records for the two sequences.

        With ``limit_unique_x`` set, data with more than ``MAX_BARS_COUNT``
        distinct labels is cut to its first ``MAX_BARS_COUNT`` rows.
        """
        df = pd.DataFrame({"x": label_data, "y": value_data})
        if limit_unique_x and df["x"].nunique() > MAX_BARS_COUNT:
            df = df.head(MAX_BARS_COUNT)
        return df.to_dict(orient="records")

    def is_continuous(self, dtype) -> bool:
        """Return True for numeric dtypes; booleans are excluded."""
        if pd.api.types.is_bool_dtype(dtype):
            return False
        # is_numeric_dtype already covers the integer and float checks.
        return pd.api.types.is_numeric_dtype(dtype)

    def is_datetime(self, dtype) -> bool:
        """Return True for datetime64 (naive or tz-aware) and timedelta64 dtypes."""
        return pd.api.types.is_datetime64_any_dtype(
            dtype
        ) or pd.api.types.is_timedelta64_dtype(dtype)

    def detect_dtype(self, data):
        """Map each column name to a SmallCardNum / Continuous / DateTime / Nominal marker.

        Raises:
            TypeError: if selecting a column does not yield a Series (for
                example with duplicated column names).
        """
        type_ = {}
        for col_name in data.columns:
            col_data = data[col_name]
            # Checked before touching .dtype so this error is reachable.
            if not isinstance(col_data, pd.Series):
                raise TypeError(f"unprocessed column type:{type(col_data)}")
            if self.is_continuous(col_data.dtype):
                # Low-cardinality numeric columns are treated as categories.
                if col_data.nunique() < self.SMALL_CARDINALITY_THRESHOLD:
                    type_[col_name] = SmallCardNum()
                else:
                    type_[col_name] = Continuous()
            elif self.is_datetime(col_data.dtype):
                type_[col_name] = DateTime()
            else:
                type_[col_name] = Nominal()
        return type_

    def build_bar_chart(self, label_data, value_data):
        """Return x/y records, truncated when there are too many distinct labels."""
        return self._build_xy_data(label_data, value_data, limit_unique_x=True)

    def build_line_chart(self, label_data, value_data):
        """Return x/y records for a line chart (no truncation)."""
        return self._build_xy_data(label_data, value_data)

    def build_pie_chart(self, label_data, value_data):
        """Return x/y records for a pie chart (no truncation)."""
        return self._build_xy_data(label_data, value_data)

    def build_histogram(self, data):
        """Bin ``data`` into equal-width bins between its min and max.

        Returns a list of ``{"bin_start", "bin_end", "frequency"}`` dicts, or
        an empty list for empty input (min/max are undefined there).
        """
        if len(data) == 0:
            return []
        range_ = (data.min(), data.max())
        counts, bins = np.histogram(data, bins=self.HISTOGRAM_BINS, range=range_)
        return [
            {
                "bin_start": bins[i],
                "bin_end": bins[i + 1],
                "frequency": count,
            }
            for i, count in enumerate(counts)
        ]

    def _single_column_chart(self, series, dtype, dtypes):
        """Pick histogram, pie or bar for a one-column result."""
        if isinstance(dtype, Continuous):
            return "hist", self.build_histogram(series.dropna()), dtypes
        if isinstance(dtype, (SmallCardNum, Nominal)):
            counts = series.value_counts()
            if counts.size <= self.MAX_PIE_SLICES:
                return "pie", self.build_pie_chart(counts.index, counts.values), dtypes
            return "bar", self.build_bar_chart(counts.index, counts.values), dtypes
        return None, None, None

    def _two_column_chart(self, df, cols, dtypes):
        """Pick line or bar for a two-column result.

        The more categorical column becomes the label, in the order
        DateTime, Nominal, SmallCardNum, Continuous; on a tie the first column
        is the label. The other column must be numeric. A DateTime label
        gives a line chart, anything else a bar chart.
        """
        label_rank = {DateTime: 0, Nominal: 1, SmallCardNum: 2, Continuous: 3}
        first, second = cols
        kind_first, kind_second = type(dtypes[first]), type(dtypes[second])
        if label_rank[kind_second] < label_rank[kind_first]:
            label_col, label_kind = second, kind_second
            value_col, value_kind = first, kind_first
        else:
            label_col, label_kind = first, kind_first
            value_col, value_kind = second, kind_second

        if value_kind not in (SmallCardNum, Continuous):
            # e.g. Nominal/Nominal or DateTime/Nominal: nothing to plot.
            return None, None, None
        if label_kind is DateTime:
            return "line", self.build_line_chart(df[label_col], df[value_col]), dtypes
        return "bar", self.build_bar_chart(df[label_col], df[value_col]), dtypes

    def format_and_select_chart(self, df: pd.DataFrame):
        """Choose a chart type for ``df`` and return its formatted data.

        Returns:
            ``(chart_type, records, dtypes)``, or ``(None, None, None)`` when
            the frame does not have one or two columns or no chart fits the
            column kinds.
        """
        cols = df.columns.tolist()
        dtypes = self.detect_dtype(df)
        if len(cols) == 1:
            col = cols[0]
            return self._single_column_chart(df[col], dtypes[col], dtypes)
        if len(cols) == 2:
            return self._two_column_chart(df, cols, dtypes)
        return None, None, None
class SQLVizChain:
    """End-to-end pipeline: route the question, generate and run SQL, then
    shape the rows into chart data with an LLM-generated chart config."""

    _CHART_TYPES = frozenset({"bar", "line", "pie", "hist"})

    def __init__(self, duckdb: DuckDBPyConnection, chain):
        self._duckdb = duckdb
        self.chain = chain
        self.router = QueryRouter(chain=self.chain)
        self.sql_generator = SQLPipeline(duckdb, chain=self.chain)
        self.charting = ChartFormatter()

    @staticmethod
    def _result(
        chart_data=None, chart_config=None, chart_type=None, sql_config=None
    ) -> dict[str, Any]:
        """Build the result dict returned by every exit path of the pipeline."""
        return {
            "chart_data": chart_data,
            "chart_config": chart_config,
            "chart_type": chart_type,
            "sql_config": sql_config,
        }

    def create_chart_config(
        self, query_df: pd.DataFrame, user_question: str, sql: str
    ) -> tuple[list[dict[Any, Any]] | None, dict[str, Any] | None, str | None]:
        """Select a chart for ``query_df`` and ask the LLM for its presentation config.

        Returns:
            ``(formatted_data, chart_config, chart_type)``, or three ``None``
            values when no chart fits the data. ``chart_config`` is whatever
            ``chain.run`` returns and is ``None`` if the LLM call failed.
        """
        (
            chart_type,
            formatted_data,
            dtypes,
        ) = self.charting.format_and_select_chart(df=query_df)
        if not all([formatted_data, dtypes, chart_type]):
            return None, None, None
        chart_config = self.chain.run(
            system_prompt=CHART_CONFIG_SYSTEM_PROMPT,
            user_prompt=CHART_CONFIG_USER_PROMPT.format(
                question=user_question,
                sql_query=sql,
                dtypes=dtypes,
                chart_type=chart_type,
            ),
            format_name="chart_config",
            response_format=PlotConfig,
        )
        logger.info(f"Chart Config Generated: {chart_config}")
        return formatted_data, chart_config, chart_type

    def create_viz_with_text_response(
        self, query_df: pd.DataFrame, user_question: str, sql_config: dict[Any, Any]
    ) -> dict[str, Any]:
        """Build the final result for a successful query.

        Falls back to plain table data (``chart_type`` and ``chart_config``
        are ``None``) when no chart could be selected or configured.

        Raises:
            ValueError: if the selected chart type is not supported or the
                formatted data fails validation.
        """
        formatted_data, chart_config, chart_type = self.create_chart_config(
            query_df, user_question, sql_config["sql_query"]
        )
        if not all([formatted_data, chart_config, chart_type]):
            logger.info("Failed to format data or generate chart config.")
            logger.info(f"Total Token Counts: {self.chain.total_tokens}")
            # The table wrapper is only built on the path that needs it.
            return self._result(
                chart_data=TableData(data=query_df), sql_config=sql_config
            )

        if chart_type not in self._CHART_TYPES:
            raise ValueError(
                "Invalid Plot Type. Must be one of 'bar', 'line', 'pie', 'hist'"
            )
        if chart_config.get("type") != chart_type:
            # The data was shaped for chart_type. Keying Charts by the type
            # echoed back from the LLM could store it in a different slot than
            # the chart_type returned alongside it.
            logger.warning(
                f"Chart config type {chart_config.get('type')!r} differs from "
                f"selected chart type {chart_type!r}; using the selected type."
            )
            chart_config = {**chart_config, "type": chart_type}

        chart_data = Data.validate_data(data=formatted_data)
        data = Charts(**{chart_type: chart_data})
        logger.info("Visualization Chain Completed Successfully.")
        logger.info(f"Total Token Counts: {self.chain.total_tokens}")
        return self._result(
            chart_data=data,
            chart_config=chart_config,
            chart_type=chart_type,
            sql_config=sql_config,
        )

    def run(self, user_question: str, context: str) -> dict[str, Any]:
        """Main pipeline: question -> SQL -> data -> chart config.

        Returns a dict with the keys ``chart_data``, ``chart_config``,
        ``chart_type`` and ``sql_config``. All four are ``None`` when the
        question is routed to 0 or when SQL generation/execution fails.
        """
        route = self.router.route_request(user_question=user_question, context=context)
        if route == 0:
            return self._result()
        sql_config, query_df = self.sql_generator.try_sql_with_retries(
            user_question=user_question, context=context
        )
        if sql_config is None or query_df is None:
            logger.info("Failed to generate or execute SQL after retries.")
            logger.info(f"Total Token Counts: {self.chain.total_tokens}")
            return self._result()
        return self.create_viz_with_text_response(
            query_df=query_df, user_question=user_question, sql_config=sql_config
        )

src/utils.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import matplotlib.pyplot as plt
def plot_chart(
    chart_type,
    data,
    title=None,
    x_axis_label=None,
    y_axis_label=None,
    **kwargs,
):
    """Draw ``data`` as a matplotlib figure of the requested kind.

    Args:
        chart_type: One of "bar", "line", "pie" or "hist".
        data: DataFrame read by column position. For bar/line/pie the first
            two columns are x and y, and only the first 20 rows are drawn.
            For hist the first three columns are bin start, bin end and
            frequency.
        title: Optional figure title.
        x_axis_label: X-axis label; defaults to the first column name
            (bar/line) or "Value Range" (hist). Not used for pie charts.
        y_axis_label: Y-axis label; defaults to the second column name
            (bar/line) or "Frequency" (hist). Not used for pie charts.
        **kwargs: Accepted and ignored, so a whole chart-config dict can be
            splatted into the call.

    Returns:
        The created matplotlib figure.

    Raises:
        ValueError: for an unsupported ``chart_type`` or too few columns.
    """
    # Validate before creating the figure so the error paths never leave an
    # unused figure registered with pyplot.
    if chart_type in {"bar", "line", "pie"}:
        if data.shape[1] < 2:
            raise ValueError("DataFrame must have at least two columns")
    elif chart_type == "hist":
        if data.shape[1] < 3:
            raise ValueError("Histogram DataFrame must have 3 columns")
    else:
        raise ValueError(f"Unsupported chart type: {chart_type}")

    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        if chart_type == "hist":
            bin_start = data.iloc[:, 0]
            bin_end = data.iloc[:, 1]
            frequency = data.iloc[:, 2]
            widths = bin_end - bin_start
            ax.bar(bin_start, frequency, width=widths, align="edge")
            ax.set_xlabel(x_axis_label or "Value Range")
            ax.set_ylabel(y_axis_label or "Frequency")
        else:
            data = data.head(20)
            x = data.iloc[:, 0]
            y = data.iloc[:, 1]
            if chart_type == "pie":
                ax.pie(y, labels=x, autopct="%1.1f%%")
                ax.axis("equal")
            else:
                if chart_type == "bar":
                    ax.bar(x, y)
                else:
                    ax.plot(x, y, marker="o")
                ax.set_xlabel(x_axis_label or data.columns[0])
                ax.set_ylabel(y_axis_label or data.columns[1])

        if title:
            ax.set_title(title)
        fig.tight_layout()
    except Exception:
        # Release the figure if plotting fails part-way through.
        plt.close(fig)
        raise
    return fig